6 changes: 6 additions & 0 deletions core/index.d.ts
@@ -687,6 +687,12 @@ export interface LLMOptions {

sourceFile?: string;
isFromAutoDetect?: boolean;

// Thinking output format options
// These allow configuring custom tags to extract thinking content from the response
// For example, vLLM can use <think>...</think> tags instead of the standard reasoning_content field
thinkingOpenTag?: string;
thinkingCloseTag?: string;
}

type RequireAtLeastOne<T, Keys extends keyof T = keyof T> = Pick<
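For reference, a minimal sketch of how the two new options might be set programmatically (illustrative only; the variable name and the <think> tag values are assumptions, not part of this diff):

```typescript
// Sketch only: assumes the model wraps its reasoning in <think>...</think>,
// as DeepSeek-R1-style models served by vLLM commonly do.
const thinkingTagOptions: Partial<LLMOptions> = {
  thinkingOpenTag: "<think>",
  thinkingCloseTag: "</think>",
};
```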
253 changes: 247 additions & 6 deletions core/llm/index.ts
@@ -84,6 +84,134 @@

type InteractionStatus = "in_progress" | "success" | "error" | "cancelled";

/**
* Helper class to extract thinking content from custom tags during streaming.
* This is used for providers like vLLM that support custom thinking output formats.
*/
export class ThinkingTagExtractor {
private buffer: string = "";
private inThinkingBlock: boolean = false;
private readonly openTag: string;
private readonly closeTag: string;

constructor(openTag: string, closeTag: string) {
this.openTag = openTag;
this.closeTag = closeTag;
}

/**
* Process a chunk of text and extract thinking/regular content.
* Returns an object with the thinking content and regular content that should be yielded.
*/
process(text: string): {
thinking: string;
content: string;
} {
this.buffer += text;

let thinking = "";
let content = "";

while (this.buffer.length > 0) {
if (this.inThinkingBlock) {
// Look for closing tag
const closeIndex = this.buffer.indexOf(this.closeTag);
if (closeIndex !== -1) {
// Found closing tag - extract thinking content up to it
thinking += this.buffer.substring(0, closeIndex);
this.buffer = this.buffer.substring(
closeIndex + this.closeTag.length,
);
this.inThinkingBlock = false;
} else {
// No closing tag yet - check if we might have a partial closing tag at the end
const partialMatchLength = this.getPartialMatchLength(
this.buffer,
this.closeTag,
);
if (partialMatchLength > 0) {
// Keep the potential partial match in the buffer
thinking += this.buffer.substring(
0,
this.buffer.length - partialMatchLength,
);
this.buffer = this.buffer.substring(
this.buffer.length - partialMatchLength,
);
} else {
// No partial match - all content is thinking
thinking += this.buffer;
this.buffer = "";
}
break;
}
} else {
// Not in thinking block - look for opening tag
const openIndex = this.buffer.indexOf(this.openTag);
if (openIndex !== -1) {
// Found opening tag
content += this.buffer.substring(0, openIndex);
this.buffer = this.buffer.substring(openIndex + this.openTag.length);
this.inThinkingBlock = true;
} else {
// No opening tag - check if we might have a partial opening tag at the end
const partialMatchLength = this.getPartialMatchLength(
this.buffer,
this.openTag,
);
if (partialMatchLength > 0) {
// Keep the potential partial match in the buffer
content += this.buffer.substring(
0,
this.buffer.length - partialMatchLength,
);
this.buffer = this.buffer.substring(
this.buffer.length - partialMatchLength,
);
} else {
// No partial match - all content is regular content
content += this.buffer;
this.buffer = "";
}
break;
}
}
}

return { thinking, content };
}

/**
* Flush any remaining content in the buffer.
* Call this when the stream ends.
*/
flush(): {
thinking: string;
content: string;
} {
const result = {
thinking: this.inThinkingBlock ? this.buffer : "",
content: this.inThinkingBlock ? "" : this.buffer,
};
this.buffer = "";
this.inThinkingBlock = false;
return result;
}

/**
* Check if the end of the text could be the start of the tag.
* Returns the length of the partial match, or 0 if no match.
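* For example, with closeTag "</think>", a buffer ending in "...</thi" yields a
* partial match of length 5, so "</thi" stays buffered until more text arrives.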
*/
private getPartialMatchLength(text: string, tag: string): number {
for (let i = 1; i < tag.length && i <= text.length; i++) {
if (text.slice(-i) === tag.slice(0, i)) {
return i;
}
}
return 0;
}
}
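// Illustrative usage sketch (an assumption, not part of this diff): tags can be
// split across streamed chunks, so partial tag text is buffered until it can be
// classified as thinking or regular content.
//
//   const extractor = new ThinkingTagExtractor("<think>", "</think>");
//   extractor.process("<thi");              // { thinking: "", content: "" } ("<thi" stays buffered)
//   extractor.process("nk>plan</think>Hi"); // { thinking: "plan", content: "Hi" }
//   extractor.flush();                      // { thinking: "", content: "" }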

export abstract class BaseLLM implements ILLM {
static providerName: string;
static defaultOptions: Partial<LLMOptions> | undefined = undefined;
@@ -196,6 +324,10 @@

isFromAutoDetect?: boolean;

// Thinking output format options
thinkingOpenTag?: string;
thinkingCloseTag?: string;

lastRequestId: string | undefined;

private _llmOptions: LLMOptions;
@@ -303,6 +435,10 @@
this.autocompleteOptions = options.autocompleteOptions;
this.sourceFile = options.sourceFile;
this.isFromAutoDetect = options.isFromAutoDetect;

// Thinking output format options
this.thinkingOpenTag = options.thinkingOpenTag;
this.thinkingCloseTag = options.thinkingCloseTag;
}

get contextLength() {
@@ -1000,18 +1136,50 @@
private processChatChunk(
chunk: ChatMessage,
interaction: ILLMInteractionLog | undefined,
thinkingExtractor?: ThinkingTagExtractor,
): {
completion: string[];
thinking: string[];
usage: Usage | null;
chunk: ChatMessage;
thinkingChunk?: ChatMessage;
} {
const completion: string[] = [];
const thinking: string[] = [];
let usage: Usage | null = null;
let outputChunk = chunk;
let thinkingChunk: ChatMessage | undefined;
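// For example, assuming "<think>"/"</think>" tags are configured, a chunk of
// { role: "assistant", content: "<think>plan</think>Hello" } produces a
// thinkingChunk of { role: "thinking", content: "plan" } and an output chunk
// whose content is "Hello".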

if (chunk.role === "assistant") {
completion.push(this._formatChatMessage(chunk));
// If we have a thinking extractor, process the content through it
if (thinkingExtractor && typeof chunk.content === "string") {
const extracted = thinkingExtractor.process(chunk.content);

if (extracted.thinking) {
thinking.push(extracted.thinking);
thinkingChunk = {
role: "thinking",
content: extracted.thinking,
};
}

if (extracted.content) {
const processedChunk: ChatMessage = {
...chunk,
content: extracted.content,
};
completion.push(this._formatChatMessage(processedChunk));
outputChunk = processedChunk;
} else {
// No regular content in this chunk, just thinking
outputChunk = {
...chunk,
content: "",
};
}
} else {
completion.push(this._formatChatMessage(chunk));
}
} else if (chunk.role === "thinking" && typeof chunk.content === "string") {
thinking.push(chunk.content);
}
@@ -1029,7 +1197,8 @@
completion,
thinking,
usage,
chunk,
chunk: outputChunk,
thinkingChunk,
};
}

@@ -1104,7 +1273,7 @@
}

// Update the streamChat method:
[Check failure on line 1276 in core/llm/index.ts (GitHub Actions / core-checks): Async generator method 'streamChat' has a complexity of 46. Maximum allowed is 36]
async *streamChat(
_messages: ChatMessage[],
signal: AbortSignal,
options: LLMFullCompletionOptions = {},
@@ -1163,6 +1332,12 @@
let usage: Usage | undefined = undefined;
let citations: null | string[] = null;

// Create thinking tag extractor if custom tags are configured
const thinkingExtractor =
this.thinkingOpenTag && this.thinkingCloseTag
? new ThinkingTagExtractor(this.thinkingOpenTag, this.thinkingCloseTag)
: undefined;

try {
if (this.templateMessages) {
for await (const chunk of this._streamComplete(
@@ -1219,13 +1394,46 @@
}

for await (const chunk of iterable) {
const result = this.processChatChunk(chunk, interaction);
const result = this.processChatChunk(
chunk,
interaction,
thinkingExtractor,
);
completion.push(...result.completion);
thinking.push(...result.thinking);
if (result.usage !== null) {
usage = result.usage;
}
yield result.chunk;
// Yield thinking chunk first if present
if (result.thinkingChunk) {
yield result.thinkingChunk;
}
// Only yield the main chunk if it has content or tool calls
const hasToolCalls =
result.chunk.role === "assistant" &&
result.chunk.toolCalls?.length;
const hasContent =
result.chunk.content && result.chunk.content.length > 0;

if (hasToolCalls || hasContent) {
yield result.chunk;
}
}

// Flush any remaining content from the extractor
if (thinkingExtractor) {
const flushed = thinkingExtractor.flush();
if (flushed.thinking) {
thinking.push(flushed.thinking);
yield { role: "thinking", content: flushed.thinking };
}
if (flushed.content) {
completion.push(flushed.content);
yield { role: "assistant", content: flushed.content };
}
}
} else {
if (logEnabled) {
@@ -1245,13 +1453,46 @@
signal,
completionOptions,
)) {
const result = this.processChatChunk(chunk, interaction);
const result = this.processChatChunk(
chunk,
interaction,
thinkingExtractor,
);
completion.push(...result.completion);
thinking.push(...result.thinking);
if (result.usage !== null) {
usage = result.usage;
}
yield result.chunk;
// Yield thinking chunk first if present
if (result.thinkingChunk) {
yield result.thinkingChunk;
}
// Only yield the main chunk if it has content or tool calls
const hasToolCalls =
result.chunk.role === "assistant" &&
result.chunk.toolCalls?.length;
const hasContent =
result.chunk.content && result.chunk.content.length > 0;

if (hasToolCalls || hasContent) {
yield result.chunk;
}
}

// Flush any remaining content from the extractor
if (thinkingExtractor) {
const flushed = thinkingExtractor.flush();
if (flushed.thinking) {
thinking.push(flushed.thinking);
yield { role: "thinking", content: flushed.thinking };
}
if (flushed.content) {
completion.push(flushed.content);
yield { role: "assistant", content: flushed.content };
}
}
}
}
22 changes: 22 additions & 0 deletions core/llm/llms/Vllm.ts
@@ -20,6 +20,28 @@ interface VllmRerankResponse {
results: VllmRerankItem[];
}

/**
* vLLM provider for Continue.
*
* vLLM supports thinking/reasoning outputs in two ways:
* 1. Via the standard `reasoning_content` field in the response (default OpenAI format)
* 2. Via custom tags in the response content (configurable)
*
* For custom thinking tag formats, you can configure `thinkingOpenTag` and `thinkingCloseTag`
* in the model options. For example:
*
* ```yaml
* models:
* - provider: vllm
* model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
* apiBase: http://localhost:8000
* thinkingOpenTag: "<think>"
* thinkingCloseTag: "</think>"
* ```
*
* See vLLM documentation for more details:
* https://docs.vllm.ai/en/latest/features/reasoning_outputs.html
*/
class Vllm extends OpenAI {
static providerName = "vllm";
constructor(options: LLMOptions) {