diff --git a/core/llm/index.ts b/core/llm/index.ts
index ceea1153dcd..03f4b5103e4 100644
--- a/core/llm/index.ts
+++ b/core/llm/index.ts
@@ -996,7 +996,6 @@ export abstract class BaseLLM implements ILLM {
     return completionOptions;
   }
 
-  // Update the processChatChunk method:
   private processChatChunk(
     chunk: ChatMessage,
     interaction: ILLMInteractionLog | undefined,
diff --git a/core/llm/llms/Vllm.ts b/core/llm/llms/Vllm.ts
index 66f9b84c407..45f381e047e 100644
--- a/core/llm/llms/Vllm.ts
+++ b/core/llm/llms/Vllm.ts
@@ -1,5 +1,12 @@
-import { Chunk, LLMOptions } from "../../index.js";
+import {
+  ChatMessage,
+  Chunk,
+  CompletionOptions,
+  LLMOptions,
+} from "../../index.js";
+import { LlmApiRequestType } from "../openaiTypeConverters.js";
+import { ThinkingTagExtractor } from "../thinkingTagExtractor.js";
 import OpenAI from "./OpenAI.js";
 
 // vLLM-specific rerank response types
@@ -20,16 +27,148 @@ interface VllmRerankResponse {
   results: VllmRerankItem[];
 }
 
+/**
+ * vLLM-specific options for thinking output extraction.
+ * These options allow configuring custom tags to extract thinking content from the response.
+ */
+export interface VllmOptions extends LLMOptions {
+  /**
+   * Custom opening tag for extracting thinking/reasoning content from streamed responses.
+   * Used with models that output thinking content wrapped in custom tags (e.g., `<think>`, `<reasoning>`).
+   * Must be used together with `thinkingCloseTag`.
+   */
+  thinkingOpenTag?: string;
+  /**
+   * Custom closing tag for extracting thinking/reasoning content from streamed responses.
+   * Must be used together with `thinkingOpenTag`.
+   */
+  thinkingCloseTag?: string;
+}
+
+/**
+ * vLLM provider for Continue.
+ *
+ * vLLM supports thinking/reasoning outputs in two ways:
+ * 1. Via the standard `reasoning_content` field in the response (default OpenAI format)
+ * 2. Via custom tags in the response content (configurable)
+ *
+ * For custom thinking tag formats, you can configure `thinkingOpenTag` and `thinkingCloseTag`
+ * in the model options. For example:
+ *
+ * ```yaml
+ * models:
+ *   - provider: vllm
+ *     model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+ *     apiBase: http://localhost:8000
+ *     thinkingOpenTag: "<think>"
+ *     thinkingCloseTag: "</think>"
+ * ```
+ *
+ * See vLLM documentation for more details:
+ * https://docs.vllm.ai/en/latest/features/reasoning_outputs.html
+ */
 class Vllm extends OpenAI {
   static providerName = "vllm";
-  constructor(options: LLMOptions) {
+
+  // vLLM-specific options for thinking tag extraction
+  private _thinkingOpenTag?: string;
+  private _thinkingCloseTag?: string;
+
+  // Override useOpenAIAdapterFor to NOT include "streamChat".
+  // vLLM uses the reasoning_content field for thinking output (via vLLM's reasoning parser),
+  // which is not part of the standard OpenAI SDK types. By excluding "streamChat", we force
+  // the use of the parent class's _streamChat method which uses streamSse for direct SSE
+  // parsing. This ensures proper handling of reasoning_content in streaming responses,
+  // as streamSse parses JSON directly and preserves all fields including non-standard ones.
+  protected override useOpenAIAdapterFor: (LlmApiRequestType | "*")[] = [
+    "chat",
+    "embed",
+    "list",
+    "rerank",
+    "streamFim",
+  ];
+
+  constructor(options: VllmOptions) {
     super(options);
+
+    // Validate that thinking tags are provided together
+    if (
+      (options.thinkingOpenTag && !options.thinkingCloseTag) ||
+      (!options.thinkingOpenTag && options.thinkingCloseTag)
+    ) {
+      throw new Error(
+        "vLLM: Both thinkingOpenTag and thinkingCloseTag must be provided together",
+      );
+    }
+
+    // Store vLLM-specific options
+    this._thinkingOpenTag = options.thinkingOpenTag;
+    this._thinkingCloseTag = options.thinkingCloseTag;
+
     if (options.isFromAutoDetect) {
       this._setupCompletionOptions();
     }
   }
 
+  /**
+   * Override _streamChat to handle thinking tag extraction if configured.
+   * This allows vLLM to support models that use custom tags (like <think>...</think>)
+   * instead of the standard reasoning_content field.
+   */
+  protected async *_streamChat(
+    messages: ChatMessage[],
+    signal: AbortSignal,
+    options: CompletionOptions,
+  ): AsyncGenerator<ChatMessage> {
+    // If no custom thinking tags configured, use parent implementation
+    if (!this._thinkingOpenTag || !this._thinkingCloseTag) {
+      for await (const chunk of super._streamChat(messages, signal, options)) {
+        yield chunk;
+      }
+      return;
+    }
+
+    // Use thinking tag extractor for custom tag formats
+    const extractor = new ThinkingTagExtractor(
+      this._thinkingOpenTag,
+      this._thinkingCloseTag,
+    );
+
+    for await (const chunk of super._streamChat(messages, signal, options)) {
+      if (chunk.role === "assistant" && typeof chunk.content === "string") {
+        const extracted = extractor.process(chunk.content);
+
+        // Yield thinking content first
+        if (extracted.thinking) {
+          yield {
+            role: "thinking",
+            content: extracted.thinking,
+          };
+        }
+
+        // Yield regular content if present
+        if (extracted.content) {
+          yield {
+            ...chunk,
+            content: extracted.content,
+          };
+        }
+      } else {
+        // Pass through non-assistant chunks unchanged
+        yield chunk;
+      }
+    }
+
+    // Flush any remaining content from the extractor
+    const flushed = extractor.flush();
+    if (flushed.thinking) {
+      yield { role: "thinking", content: flushed.thinking };
+    }
+    if (flushed.content) {
+      yield { role: "assistant", content: flushed.content };
+    }
+  }
+
   supportsFim(): boolean {
     return false;
   }
diff --git a/core/llm/thinkingTagExtractor.ts b/core/llm/thinkingTagExtractor.ts
new file mode 100644
index 00000000000..67676a5720c
--- /dev/null
+++ b/core/llm/thinkingTagExtractor.ts
@@ -0,0 +1,127 @@
+/**
+ * Helper class to extract thinking content from custom tags during streaming.
+ * This is used for providers like vLLM that support custom thinking output formats.
+ */
+export class ThinkingTagExtractor {
+  private buffer: string = "";
+  private inThinkingBlock: boolean = false;
+  private readonly openTag: string;
+  private readonly closeTag: string;
+
+  constructor(openTag: string, closeTag: string) {
+    this.openTag = openTag;
+    this.closeTag = closeTag;
+  }
+
+  /**
+   * Process a chunk of text and extract thinking/regular content.
+   * Returns an object with the thinking content and regular content that should be yielded.
+   */
+  process(text: string): {
+    thinking: string;
+    content: string;
+  } {
+    this.buffer += text;
+
+    let thinking = "";
+    let content = "";
+
+    while (this.buffer.length > 0) {
+      if (this.inThinkingBlock) {
+        // Look for closing tag
+        const closeIndex = this.buffer.indexOf(this.closeTag);
+        if (closeIndex !== -1) {
+          // Found closing tag - extract thinking content up to it
+          thinking += this.buffer.substring(0, closeIndex);
+          this.buffer = this.buffer.substring(
+            closeIndex + this.closeTag.length,
+          );
+          this.inThinkingBlock = false;
+        } else {
+          // No closing tag yet - check if we might have a partial closing tag at the end
+          const partialMatchLength = this.getPartialMatchLength(
+            this.buffer,
+            this.closeTag,
+          );
+          if (partialMatchLength > 0) {
+            // Keep the potential partial match in the buffer
+            thinking += this.buffer.substring(
+              0,
+              this.buffer.length - partialMatchLength,
+            );
+            this.buffer = this.buffer.substring(
+              this.buffer.length - partialMatchLength,
+            );
+          } else {
+            // No partial match - all content is thinking
+            thinking += this.buffer;
+            this.buffer = "";
+          }
+          break;
+        }
+      } else {
+        // Not in thinking block - look for opening tag
+        const openIndex = this.buffer.indexOf(this.openTag);
+        if (openIndex !== -1) {
+          // Found opening tag
+          content += this.buffer.substring(0, openIndex);
+          this.buffer = this.buffer.substring(openIndex + this.openTag.length);
+          this.inThinkingBlock = true;
+        } else {
+          // No opening tag - check if we might have a partial opening tag at the end
+          const partialMatchLength = this.getPartialMatchLength(
+            this.buffer,
+            this.openTag,
+          );
+          if (partialMatchLength > 0) {
+            // Keep the potential partial match in the buffer
+            content += this.buffer.substring(
+              0,
+              this.buffer.length - partialMatchLength,
+            );
+            this.buffer = this.buffer.substring(
+              this.buffer.length - partialMatchLength,
+            );
+          } else {
+            // No partial match - all content is regular content
+            content += this.buffer;
+            this.buffer = "";
+          }
+          break;
+        }
+      }
+    }
+
+    return { thinking, content };
+  }
+
+  /**
+   * Flush any remaining content in the buffer.
+   * Call this when the stream ends.
+   */
+  flush(): {
+    thinking: string;
+    content: string;
+  } {
+    const result = {
+      thinking: this.inThinkingBlock ? this.buffer : "",
+      content: this.inThinkingBlock ? "" : this.buffer,
+    };
+    this.buffer = "";
+    this.inThinkingBlock = false;
+    return result;
+  }
+
+  /**
+   * Check if the end of the text could be the start of the tag.
+   * Returns the length of the partial match, or 0 if no match.
+   */
+  private getPartialMatchLength(text: string, tag: string): number {
+    for (let i = 1; i < tag.length && i <= text.length; i++) {
+      if (text.slice(-i) === tag.slice(0, i)) {
+        return i;
+      }
+    }
+    return 0;
+  }
+}
diff --git a/core/llm/thinkingTagExtractor.vitest.ts b/core/llm/thinkingTagExtractor.vitest.ts
new file mode 100644
index 00000000000..2e3190bb753
--- /dev/null
+++ b/core/llm/thinkingTagExtractor.vitest.ts
@@ -0,0 +1,223 @@
+import { beforeEach, describe, expect, it } from "vitest";
+import { ThinkingTagExtractor } from "./thinkingTagExtractor";
+
+/**
+ * Unit tests for ThinkingTagExtractor class.
+ * These tests verify the thinking tag extraction functionality that is used
+ * by vLLM provider for custom thinking output formats.
+ */
+describe("ThinkingTagExtractor", () => {
+  let extractor: ThinkingTagExtractor;
+
+  beforeEach(() => {
+    extractor = new ThinkingTagExtractor("<think>", "</think>");
+  });
+
+  describe("basic functionality", () => {
+    it("should extract thinking content from single text", () => {
+      const result = extractor.process("<think>my thinking</think>my response");
+
+      expect(result.thinking).toBe("my thinking");
+      expect(result.content).toBe("my response");
+    });
+
+    it("should handle text without thinking tags", () => {
+      const result = extractor.process("just regular content");
+
+      expect(result.thinking).toBe("");
+      expect(result.content).toBe("just regular content");
+    });
+
+    it("should handle only thinking content", () => {
+      const result = extractor.process("<think>only thinking</think>");
+
+      expect(result.thinking).toBe("only thinking");
+      expect(result.content).toBe("");
+    });
+
+    it("should handle multiple thinking blocks", () => {
+      const result = extractor.process(
+        "<think>first</think>text1<think>second</think>text2",
+      );
+
+      expect(result.thinking).toBe("firstsecond");
+      expect(result.content).toBe("text1text2");
+    });
+  });
+
+  describe("streaming chunks", () => {
+    it("should handle thinking split across multiple chunks", () => {
+      const result1 = extractor.process("<think>first ");
+      const result2 = extractor.process("part</think>answer ");
+      const result3 = extractor.process("here");
+
+      // First chunk starts thinking
+      expect(result1.thinking).toBe("first ");
+      expect(result1.content).toBe("");
+
+      // Second chunk ends thinking and starts content
+      expect(result2.thinking).toBe("part");
+      expect(result2.content).toBe("answer ");
+
+      // Third chunk is all content
+      expect(result3.thinking).toBe("");
+      expect(result3.content).toBe("here");
+    });
+
+    it("should handle partial tags at chunk boundaries", () => {
+      const result1 = extractor.process("before<th");
+      const result2 = extractor.process("ink>thinking</th");
+      const result3 = extractor.process("ink>after");
+
+      // Partial tag should be buffered
+      expect(result1.thinking).toBe("");
+      expect(result1.content).toBe("before");
+
+      // Complete the opening tag, buffer closing tag
+      expect(result2.thinking).toBe("thinking");
+      expect(result2.content).toBe("");
+
+      // Complete the closing tag
+      expect(result3.thinking).toBe("");
+      expect(result3.content).toBe("after");
+    });
+
+    it("should handle multiple chunks with complete tags", () => {
+      const result1 = extractor.process("<think>first</think>text1");
+      const result2 = extractor.process("<think>second</think>text2");
+
+      expect(result1.thinking).toBe("first");
+      expect(result1.content).toBe("text1");
+
+      expect(result2.thinking).toBe("second");
+      expect(result2.content).toBe("text2");
+    });
+  });
+
+  describe("flush behavior", () => {
+    it("should flush remaining content at stream end", () => {
+      // Process incomplete thinking
+      const result = extractor.process("<think>incomplete thinking");
+      expect(result.thinking).toBe("incomplete thinking");
+      expect(result.content).toBe("");
+
+      // Flush any remaining buffered content
+      const flushed = extractor.flush();
+      expect(flushed.thinking).toBe("");
+      expect(flushed.content).toBe("");
+    });
+
+    it("should flush partial tag as content when outside thinking block", () => {
+      // Process content with partial opening tag
+      extractor.process("some text <thi");
+
+      const flushed = extractor.flush();
+      expect(flushed.thinking).toBe("");
+      expect(flushed.content).toBe("<thi");
+    });
+
+    it("should flush partial closing tag as thinking when inside thinking block", () => {
+      // Start thinking block and leave partial closing tag
+      extractor.process("<think>thinking content </thi");
+
+      const flushed = extractor.flush();
+      expect(flushed.thinking).toBe("</thi");
+      expect(flushed.content).toBe("");
+    });
+
+    it("should reset state after flush", () => {
+      extractor.process("<think>first</think>");
+      extractor.flush();
+
+      // After flush, extractor should be reset
+      const result = extractor.process("new content");
+      expect(result.thinking).toBe("");
+      expect(result.content).toBe("new content");
+    });
+  });
+
+  describe("custom tag formats", () => {
+    it("should work with custom reasoning tags", () => {
+      const customExtractor = new ThinkingTagExtractor(
+        "<reasoning>",
+        "</reasoning>",
+      );
+
+      const result = customExtractor.process(
+        "<reasoning>my reasoning</reasoning>my answer",
+      );
+
+      expect(result.thinking).toBe("my reasoning");
+      expect(result.content).toBe("my answer");
+    });
+
+    it("should work with bracket-style tags", () => {
+      const customExtractor = new ThinkingTagExtractor("[THINK]", "[/THINK]");
+
+      const result = customExtractor.process(
+        "[THINK]internal thought[/THINK]response",
+      );
+
+      expect(result.thinking).toBe("internal thought");
+      expect(result.content).toBe("response");
+    });
+
+    it("should work with longer custom tags", () => {
+      const customExtractor = new ThinkingTagExtractor(
+        "<|thinking|>",
+        "<|/thinking|>",
+      );
+
+      const result = customExtractor.process(
+        "<|thinking|>deep thought<|/thinking|>answer",
+      );
+
+      expect(result.thinking).toBe("deep thought");
+      expect(result.content).toBe("answer");
+    });
+  });
+
+  describe("edge cases", () => {
+    it("should handle empty string", () => {
+      const result = extractor.process("");
+
+      expect(result.thinking).toBe("");
+      expect(result.content).toBe("");
+    });
+
+    it("should handle nested-looking but not actually nested tags", () => {
+      // Not real nesting since the first </think> closes
+      const result = extractor.process("<think>outer<think>inner</think>after");
+
+      expect(result.thinking).toBe("outer<think>inner");
+      expect(result.content).toBe("after");
+    });
+
+    it("should handle content before thinking", () => {
+      const result = extractor.process("intro<think>thinking</think>outro");
+
+      expect(result.thinking).toBe("thinking");
+      expect(result.content).toBe("introoutro");
+    });
+
+    it("should handle special characters in content", () => {
+      const result = extractor.process(
+        "<think>a < b && c > d</think>result: x < y",
+      );
+
+      expect(result.thinking).toBe("a < b && c > d");
+      expect(result.content).toBe("result: x < y");
+    });
+
+    it("should handle newlines in thinking and content", () => {
+      const result = extractor.process(
+        "<think>line1\nline2</think>response\nmore",
+      );
+
+      expect(result.thinking).toBe("line1\nline2");
+      expect(result.content).toBe("response\nmore");
+    });
+  });
+});
diff --git a/core/llm/thinkingTagIntegration.vitest.ts b/core/llm/thinkingTagIntegration.vitest.ts
new file mode 100644
index 00000000000..dbb11970f2c
--- /dev/null
+++ b/core/llm/thinkingTagIntegration.vitest.ts
@@ -0,0 +1,397 @@
+import { beforeEach, describe, expect, it } from "vitest";
+import { ChatMessage, CompletionOptions } from "../index";
+import Vllm, { VllmOptions } from "./llms/Vllm";
+import { ThinkingTagExtractor } from "./thinkingTagExtractor";
+
+/**
+ * Mock vLLM for testing thinking tag extraction during streaming.
+ * The mock overrides _streamChat to return controlled chunks and then applies
+ * the same thinking tag extraction logic that Vllm's _streamChat uses.
+ */
+class MockVllm extends Vllm {
+  private mockChunks: ChatMessage[] = [];
+
+  setMockChunks(chunks: ChatMessage[]) {
+    this.mockChunks = chunks;
+  }
+
+  /**
+   * Override _streamChat to bypass the real HTTP calls but still
+   * apply the thinking tag extraction logic from the parent Vllm class.
+   */
+  protected override async *_streamChat(
+    messages: ChatMessage[],
+    signal: AbortSignal,
+    options: CompletionOptions,
+  ): AsyncGenerator<ChatMessage> {
+    // Get the thinking tags from the instance (using type assertion for private access)
+    const openTag = (this as unknown as { _thinkingOpenTag?: string })
+      ._thinkingOpenTag;
+    const closeTag = (this as unknown as { _thinkingCloseTag?: string })
+      ._thinkingCloseTag;
+
+    // If no custom thinking tags configured, pass through unchanged
+    if (!openTag || !closeTag) {
+      for (const chunk of this.mockChunks) {
+        yield chunk;
+      }
+      return;
+    }
+
+    // Use thinking tag extractor for custom tag formats
+    const extractor = new ThinkingTagExtractor(openTag, closeTag);
+
+    for (const chunk of this.mockChunks) {
+      if (chunk.role === "assistant" && typeof chunk.content === "string") {
+        const extracted = extractor.process(chunk.content);
+
+        // Yield thinking content first
+        if (extracted.thinking) {
+          yield {
+            role: "thinking",
+            content: extracted.thinking,
+          };
+        }
+
+        // Yield regular content if present
+        if (extracted.content) {
+          yield {
+            ...chunk,
+            content: extracted.content,
+          };
+        }
+      } else {
+        // Pass through non-assistant chunks unchanged (including native thinking role)
+        yield chunk;
+      }
+    }
+
+    // Flush any remaining content from the extractor
+    const flushed = extractor.flush();
+    if (flushed.thinking) {
+      yield { role: "thinking", content: flushed.thinking };
+    }
+    if (flushed.content) {
+      yield { role: "assistant", content: flushed.content };
+    }
+  }
+}
+
+describe("ThinkingTagExtractor Integration with vLLM", () => {
+  let llm: MockVllm;
+
+  beforeEach(() => {
+    const options: VllmOptions = {
+      model: "mock-model",
+      apiBase: "http://localhost:8000",
+      thinkingOpenTag: "<think>",
+      thinkingCloseTag: "</think>",
+      // Use "none" template to bypass template-based message formatting
+      // which would otherwise wrap all chunks with role: "assistant"
+      template: "none" as any,
+    };
+    llm = new MockVllm(options);
+  });
+
+  describe("streamChat with thinking tags", () => {
+    it("should extract thinking content from single chunk", async () => {
+      llm.setMockChunks([
+        {
+          role: "assistant",
+          content: "<think>my thinking</think>my response",
+        },
+      ]);
+
+      const chunks: ChatMessage[] = [];
+      for await (const chunk of llm.streamChat(
+        [{ role: "user", content: "test" }],
+        new AbortController().signal,
+      )) {
+        chunks.push(chunk);
+      }
+
+      expect(chunks).toHaveLength(2);
+      expect(chunks[0]).toEqual({
+        role: "thinking",
+        content: "my thinking",
+      });
+      expect(chunks[1]).toEqual({
+        role: "assistant",
+        content: "my response",
+      });
+    });
+
+    it("should handle thinking split across multiple chunks", async () => {
+      llm.setMockChunks([
+        { role: "assistant", content: "<think>first " },
+        { role: "assistant", content: "part</think>answer " },
+        { role: "assistant", content: "here" },
+      ]);
+
+      const chunks: ChatMessage[] = [];
+      for await (const chunk of llm.streamChat(
+        [{ role: "user", content: "test" }],
+        new AbortController().signal,
+      )) {
+        chunks.push(chunk);
+      }
+
+      // Should get: thinking chunks as they arrive, then answer chunks
+      const thinkingChunks = chunks.filter((c) => c.role === "thinking");
+      const assistantChunks = chunks.filter((c) => c.role === "assistant");
+
+      expect(thinkingChunks.length).toBeGreaterThan(0);
+      expect(thinkingChunks.map((c) => c.content).join("")).toBe("first part");
+      expect(assistantChunks.map((c) => c.content).join("")).toBe(
+        "answer here",
+      );
+    });
+
+    it("should handle partial tags at chunk boundaries", async () => {
+      llm.setMockChunks([
+        { role: "assistant", content: "before<th" },
role: "assistant", content: "beforethinkingafter" }, + ]); + + const chunks: ChatMessage[] = []; + for await (const chunk of llm.streamChat( + [{ role: "user", content: "test" }], + new AbortController().signal, + )) { + chunks.push(chunk); + } + + const thinkingChunks = chunks.filter((c) => c.role === "thinking"); + const assistantChunks = chunks.filter((c) => c.role === "assistant"); + + expect(thinkingChunks.map((c) => c.content).join("")).toBe("thinking"); + expect(assistantChunks.map((c) => c.content).join("")).toBe( + "beforeafter", + ); + }); + + it("should flush remaining content at stream end", async () => { + llm.setMockChunks([ + { role: "assistant", content: "incomplete thinking" }, + ]); + + const chunks: ChatMessage[] = []; + for await (const chunk of llm.streamChat( + [{ role: "user", content: "test" }], + new AbortController().signal, + )) { + chunks.push(chunk); + } + + // Should get thinking chunk(s) for the incomplete thinking content + const thinkingChunks = chunks.filter((c) => c.role === "thinking"); + expect(thinkingChunks.length).toBeGreaterThan(0); + expect(thinkingChunks.map((c) => c.content).join("")).toBe( + "incomplete thinking", + ); + }); + + it("should handle multiple thinking blocks in stream", async () => { + llm.setMockChunks([ + { role: "assistant", content: "firsttext1" }, + { role: "assistant", content: "secondtext2" }, + ]); + + const chunks: ChatMessage[] = []; + for await (const chunk of llm.streamChat( + [{ role: "user", content: "test" }], + new AbortController().signal, + )) { + chunks.push(chunk); + } + + const thinkingChunks = chunks.filter((c) => c.role === "thinking"); + const assistantChunks = chunks.filter((c) => c.role === "assistant"); + + expect(thinkingChunks.map((c) => c.content).join("")).toBe("firstsecond"); + expect(assistantChunks.map((c) => c.content).join("")).toBe("text1text2"); + }); + + it("should not emit empty chunks", async () => { + llm.setMockChunks([ + { role: "assistant", content: "only thinking" }, + ]); + + const chunks: ChatMessage[] = []; + for await (const chunk of llm.streamChat( + [{ role: "user", content: "test" }], + new AbortController().signal, + )) { + chunks.push(chunk); + } + + // Should only have thinking chunk, no empty assistant chunk + expect(chunks.every((c) => c.content && c.content.length > 0)).toBe(true); + expect(chunks.filter((c) => c.role === "thinking")).toHaveLength(1); + expect(chunks.filter((c) => c.role === "assistant")).toHaveLength(0); + }); + }); + + describe("streamChat without thinking tags configured", () => { + beforeEach(() => { + // Create vLLM without thinking tags + const options: VllmOptions = { + model: "mock-model", + apiBase: "http://localhost:8000", + template: "none" as any, + }; + llm = new MockVllm(options); + }); + + it("should pass through content unchanged when no tags configured", async () => { + llm.setMockChunks([ + { + role: "assistant", + content: "this should not be extractedregular content", + }, + ]); + + const chunks: ChatMessage[] = []; + for await (const chunk of llm.streamChat( + [{ role: "user", content: "test" }], + new AbortController().signal, + )) { + chunks.push(chunk); + } + + expect(chunks).toHaveLength(1); + expect(chunks[0]).toEqual({ + role: "assistant", + content: "this should not be extractedregular content", + }); + }); + }); + + describe("streamChat with native thinking role chunks", () => { + it("should handle native thinking role chunks alongside extraction", async () => { + // Simulate a provider that sends both native thinking role AND 
+      llm.setMockChunks([
+        { role: "thinking", content: "native thinking" },
+        { role: "assistant", content: "<think>tagged thinking</think>answer" },
+      ]);
+
+      const chunks: ChatMessage[] = [];
+      for await (const chunk of llm.streamChat(
+        [{ role: "user", content: "test" }],
+        new AbortController().signal,
+      )) {
+        chunks.push(chunk);
+      }
+
+      const thinkingChunks = chunks.filter((c) => c.role === "thinking");
+      const assistantChunks = chunks.filter((c) => c.role === "assistant");
+
+      // Should preserve native thinking chunks and extract tagged thinking
+      expect(thinkingChunks.map((c) => c.content).join("")).toBe(
+        "native thinkingtagged thinking",
+      );
+      expect(assistantChunks.map((c) => c.content).join("")).toBe("answer");
+    });
+  });
+
+  describe("custom tag formats", () => {
+    it("should work with custom reasoning tags", async () => {
+      const options: VllmOptions = {
+        model: "mock-model",
+        apiBase: "http://localhost:8000",
+        thinkingOpenTag: "<reasoning>",
+        thinkingCloseTag: "</reasoning>",
+        template: "none" as any,
+      };
+      llm = new MockVllm(options);
+
+      llm.setMockChunks([
+        {
+          role: "assistant",
+          content: "<reasoning>my reasoning</reasoning>my answer",
+        },
+      ]);
+
+      const chunks: ChatMessage[] = [];
+      for await (const chunk of llm.streamChat(
+        [{ role: "user", content: "test" }],
+        new AbortController().signal,
+      )) {
+        chunks.push(chunk);
+      }
+
+      expect(chunks).toHaveLength(2);
+      expect(chunks[0]).toEqual({
+        role: "thinking",
+        content: "my reasoning",
+      });
+      expect(chunks[1]).toEqual({
+        role: "assistant",
+        content: "my answer",
+      });
+    });
+
+    it("should work with bracket-style tags", async () => {
+      const options: VllmOptions = {
+        model: "mock-model",
+        apiBase: "http://localhost:8000",
+        thinkingOpenTag: "[THINK]",
+        thinkingCloseTag: "[/THINK]",
+        template: "none" as any,
+      };
+      llm = new MockVllm(options);
+
+      llm.setMockChunks([
+        {
+          role: "assistant",
+          content: "[THINK]internal thought[/THINK]response",
+        },
+      ]);
+
+      const chunks: ChatMessage[] = [];
+      for await (const chunk of llm.streamChat(
+        [{ role: "user", content: "test" }],
+        new AbortController().signal,
+      )) {
+        chunks.push(chunk);
+      }
+
+      expect(chunks).toHaveLength(2);
+      expect(chunks[0]).toEqual({
+        role: "thinking",
+        content: "internal thought",
+      });
+      expect(chunks[1]).toEqual({
+        role: "assistant",
+        content: "response",
+      });
+    });
+  });
+
+  describe("validation", () => {
+    it("should throw error when only thinkingOpenTag is provided", () => {
+      expect(() => {
+        new MockVllm({
+          model: "test-model",
+          apiBase: "http://localhost:8000",
+          thinkingOpenTag: "<think>",
+        });
+      }).toThrow(
+        "vLLM: Both thinkingOpenTag and thinkingCloseTag must be provided together",
+      );
+    });
+
+    it("should throw error when only thinkingCloseTag is provided", () => {
+      expect(() => {
+        new MockVllm({
+          model: "test-model",
+          apiBase: "http://localhost:8000",
+          thinkingCloseTag: "</think>",
+        });
+      }).toThrow(
+        "vLLM: Both thinkingOpenTag and thinkingCloseTag must be provided together",
+      );
+    });
+  });
+});
diff --git a/docs/customize/model-providers/more/vllm.mdx b/docs/customize/model-providers/more/vllm.mdx
index 3f3bdd643f0..599d543237a 100644
--- a/docs/customize/model-providers/more/vllm.mdx
+++ b/docs/customize/model-providers/more/vllm.mdx
@@ -104,4 +104,45 @@ Continue automatically handles vLLM's response format (which uses `results` inst
 
 [Click here](../../model-roles/reranking) to see a list of reranking model providers.
 
+## Thinking output format
+
+vLLM supports thinking/reasoning outputs in two ways:
+
+1. **Standard format** - Via the `reasoning_content` field in the response (default OpenAI format)
+2. **Custom tags** - Via configurable tags in the response content
+
+For models that use custom thinking tag formats (like `<think>...</think>` or `<reasoning>...</reasoning>`), you can configure `thinkingOpenTag` and `thinkingCloseTag` to extract thinking content:
+
+<Tabs>
+  <Tab title="YAML">
+  ```yaml title="config.yaml"
+  models:
+    - name: DeepSeek R1 Distill
+      provider: vllm
+      model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+      apiBase: http://localhost:8000/v1
+      thinkingOpenTag: "<think>"
+      thinkingCloseTag: "</think>"
+  ```
+  </Tab>
+  <Tab title="JSON">
+  ```json title="config.json"
+  {
+    "models": [
+      {
+        "title": "DeepSeek R1 Distill",
+        "provider": "vllm",
+        "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+        "apiBase": "http://localhost:8000/v1",
+        "thinkingOpenTag": "<think>",
+        "thinkingCloseTag": "</think>"
+      }
+    ]
+  }
+  ```
+  </Tab>
+</Tabs>
+
+See vLLM's [reasoning outputs documentation](https://docs.vllm.ai/en/latest/features/reasoning_outputs.html) for more details.
+
 The continue implementation uses [OpenAI](../top-level/openai) under the hood. [View the source](https://github.com/continuedev/continue/blob/main/core/llm/llms/Vllm.ts)
diff --git a/docs/reference.mdx b/docs/reference.mdx
index e467a5a3109..8e9ac2e0eab 100644
--- a/docs/reference.mdx
+++ b/docs/reference.mdx
@@ -146,6 +146,10 @@ The `models` section defines the language models used in your configuration. Mod
 - `useRecentlyEdited`: If `true`, includes recently edited files in context.
 - `useRecentlyOpened`: If `true`, includes recently opened files in context.
 
+- `thinkingOpenTag`: Custom opening tag for extracting thinking/reasoning content from streamed responses. Used with models that output thinking content wrapped in custom tags (e.g., `<think>`, `<reasoning>`). Must be used together with `thinkingCloseTag`. See the [vLLM provider documentation](/customize/model-providers/more/vllm#thinking-output-format) for examples.
+
+- `thinkingCloseTag`: Custom closing tag for extracting thinking/reasoning content from streamed responses. Must be used together with `thinkingOpenTag`.
+
 **Example:**
 
 ```yaml title="config.yaml"
@@ -179,6 +183,12 @@ models:
     roles:
       - chat
       - edit
+  - name: vLLM with Custom Thinking Tags
+    provider: vllm
+    model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+    apiBase: http://localhost:8000/v1
+    thinkingOpenTag: "<think>"
+    thinkingCloseTag: "</think>"
 ```
 
 ---
diff --git a/docs/reference/json-reference.mdx b/docs/reference/json-reference.mdx
index ad5ef6d161e..d538cecb043 100644
--- a/docs/reference/json-reference.mdx
+++ b/docs/reference/json-reference.mdx
@@ -60,6 +60,10 @@ Each model has specific configuration options tailored to its provider and funct
 - `uploadImage`: Boolean indicating if the model supports image uploads.
 - `tools`: Boolean indicating if the model supports tool use.
 
+- `thinkingOpenTag`: Custom opening tag for extracting thinking/reasoning content from streamed responses. Used with models that output thinking content wrapped in custom tags (e.g., `<think>`, `<reasoning>`). Must be used together with `thinkingCloseTag`. See the [vLLM provider documentation](/customize/model-providers/more/vllm#thinking-output-format) for examples.
+
+- `thinkingCloseTag`: Custom closing tag for extracting thinking/reasoning content from streamed responses. Must be used together with `thinkingOpenTag`.
+
 _(AWS Only)_
 
 - `profile`: AWS security profile for authorization.