6 changes: 6 additions & 0 deletions core/index.d.ts
@@ -687,6 +687,12 @@ export interface LLMOptions {

sourceFile?: string;
isFromAutoDetect?: boolean;

// Thinking output format options
// These allow configuring custom tags to extract thinking content from the response
// For example, vLLM can use <think>...</think> tags instead of the standard reasoning_content field
thinkingOpenTag?: string;
thinkingCloseTag?: string;
}

type RequireAtLeastOne<T, Keys extends keyof T = keyof T> = Pick<
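For reference, a minimal sketch of how the two new options might be set programmatically (illustrative only; the variable name and the <think> tag values are assumptions, not part of this diff):

```typescript
// Sketch only: assumes the model wraps its reasoning in <think>...</think>,
// as DeepSeek-R1-style models served by vLLM commonly do.
const thinkingTagOptions: Partial<LLMOptions> = {
  thinkingOpenTag: "<think>",
  thinkingCloseTag: "</think>",
};
```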
253 changes: 247 additions & 6 deletions core/llm/index.ts
@@ -84,6 +84,134 @@

type InteractionStatus = "in_progress" | "success" | "error" | "cancelled";

/**
* Helper class to extract thinking content from custom tags during streaming.
* This is used for providers like vLLM that support custom thinking output formats.
*/
export class ThinkingTagExtractor {
private buffer: string = "";
private inThinkingBlock: boolean = false;
private readonly openTag: string;
private readonly closeTag: string;

constructor(openTag: string, closeTag: string) {
this.openTag = openTag;
this.closeTag = closeTag;
}

/**
* Process a chunk of text and extract thinking/regular content.
* Returns an object with the thinking content and regular content that should be yielded.
*/
process(text: string): {
thinking: string;
content: string;
} {
this.buffer += text;

let thinking = "";
let content = "";

while (this.buffer.length > 0) {
if (this.inThinkingBlock) {
// Look for closing tag
const closeIndex = this.buffer.indexOf(this.closeTag);
if (closeIndex !== -1) {
// Found closing tag - extract thinking content up to it
thinking += this.buffer.substring(0, closeIndex);
this.buffer = this.buffer.substring(
closeIndex + this.closeTag.length,
);
this.inThinkingBlock = false;
} else {
// No closing tag yet - check if we might have a partial closing tag at the end
const partialMatchLength = this.getPartialMatchLength(
this.buffer,
this.closeTag,
);
if (partialMatchLength > 0) {
// Keep the potential partial match in the buffer
thinking += this.buffer.substring(
0,
this.buffer.length - partialMatchLength,
);
this.buffer = this.buffer.substring(
this.buffer.length - partialMatchLength,
);
} else {
// No partial match - all content is thinking
thinking += this.buffer;
this.buffer = "";
}
break;
}
} else {
// Not in thinking block - look for opening tag
const openIndex = this.buffer.indexOf(this.openTag);
if (openIndex !== -1) {
// Found opening tag
content += this.buffer.substring(0, openIndex);
this.buffer = this.buffer.substring(openIndex + this.openTag.length);
this.inThinkingBlock = true;
} else {
// No opening tag - check if we might have a partial opening tag at the end
const partialMatchLength = this.getPartialMatchLength(
this.buffer,
this.openTag,
);
if (partialMatchLength > 0) {
// Keep the potential partial match in the buffer
content += this.buffer.substring(
0,
this.buffer.length - partialMatchLength,
);
this.buffer = this.buffer.substring(
this.buffer.length - partialMatchLength,
);
} else {
// No partial match - all content is regular content
content += this.buffer;
this.buffer = "";
}
break;
}
}
}

return { thinking, content };
}

/**
* Flush any remaining content in the buffer.
* Call this when the stream ends.
*/
flush(): {
thinking: string;
content: string;
} {
const result = {
thinking: this.inThinkingBlock ? this.buffer : "",
content: this.inThinkingBlock ? "" : this.buffer,
};
this.buffer = "";
this.inThinkingBlock = false;
return result;
}

/**
* Check if the end of the text could be the start of the tag.
* Returns the length of the partial match, or 0 if no match.
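* For example, with closeTag "</think>", a buffer ending in "...</thi" yields a
* partial match of length 5, so "</thi" stays buffered until more text arrives.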
*/
private getPartialMatchLength(text: string, tag: string): number {
for (let i = 1; i < tag.length && i <= text.length; i++) {
if (text.slice(-i) === tag.slice(0, i)) {
return i;
}
}
return 0;
}
}
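// Illustrative usage sketch (an assumption, not part of this diff): tags can be
// split across streamed chunks, so partial tag text is buffered until it can be
// classified as thinking or regular content.
//
//   const extractor = new ThinkingTagExtractor("<think>", "</think>");
//   extractor.process("<thi");              // { thinking: "", content: "" } ("<thi" stays buffered)
//   extractor.process("nk>plan</think>Hi"); // { thinking: "plan", content: "Hi" }
//   extractor.flush();                      // { thinking: "", content: "" }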

export abstract class BaseLLM implements ILLM {
static providerName: string;
static defaultOptions: Partial<LLMOptions> | undefined = undefined;
@@ -196,6 +324,10 @@

isFromAutoDetect?: boolean;

// Thinking output format options
thinkingOpenTag?: string;
thinkingCloseTag?: string;

lastRequestId: string | undefined;

private _llmOptions: LLMOptions;
@@ -303,6 +435,10 @@
this.autocompleteOptions = options.autocompleteOptions;
this.sourceFile = options.sourceFile;
this.isFromAutoDetect = options.isFromAutoDetect;

// Thinking output format options
this.thinkingOpenTag = options.thinkingOpenTag;
this.thinkingCloseTag = options.thinkingCloseTag;
}

get contextLength() {
@@ -1000,18 +1136,50 @@
private processChatChunk(
chunk: ChatMessage,
interaction: ILLMInteractionLog | undefined,
thinkingExtractor?: ThinkingTagExtractor,
): {
completion: string[];
thinking: string[];
usage: Usage | null;
chunk: ChatMessage;
thinkingChunk?: ChatMessage;
} {
const completion: string[] = [];
const thinking: string[] = [];
let usage: Usage | null = null;
let outputChunk = chunk;
let thinkingChunk: ChatMessage | undefined;
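// For example, assuming "<think>"/"</think>" tags are configured, a chunk of
// { role: "assistant", content: "<think>plan</think>Hello" } produces a
// thinkingChunk of { role: "thinking", content: "plan" } and an output chunk
// whose content is "Hello".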

if (chunk.role === "assistant") {
completion.push(this._formatChatMessage(chunk));
// If we have a thinking extractor, process the content through it
if (thinkingExtractor && typeof chunk.content === "string") {
const extracted = thinkingExtractor.process(chunk.content);

if (extracted.thinking) {
thinking.push(extracted.thinking);
thinkingChunk = {
role: "thinking",
content: extracted.thinking,
};
}

if (extracted.content) {
const processedChunk: ChatMessage = {
...chunk,
content: extracted.content,
};
completion.push(this._formatChatMessage(processedChunk));
outputChunk = processedChunk;
} else {
// No regular content in this chunk, just thinking
outputChunk = {
...chunk,
content: "",
};
}
} else {
completion.push(this._formatChatMessage(chunk));
}
} else if (chunk.role === "thinking" && typeof chunk.content === "string") {
thinking.push(chunk.content);
}
@@ -1029,7 +1197,8 @@
completion,
thinking,
usage,
chunk,
chunk: outputChunk,
thinkingChunk,
};
}

@@ -1104,7 +1273,7 @@
}

// Update the streamChat method:
[Check failure on line 1276 in core/llm/index.ts (GitHub Actions / core-checks): Async generator method 'streamChat' has a complexity of 46. Maximum allowed is 36]
async *streamChat(
_messages: ChatMessage[],
signal: AbortSignal,
options: LLMFullCompletionOptions = {},
@@ -1163,6 +1332,12 @@
let usage: Usage | undefined = undefined;
let citations: null | string[] = null;

// Create thinking tag extractor if custom tags are configured
const thinkingExtractor =
this.thinkingOpenTag && this.thinkingCloseTag
? new ThinkingTagExtractor(this.thinkingOpenTag, this.thinkingCloseTag)
: undefined;

try {
if (this.templateMessages) {
for await (const chunk of this._streamComplete(
@@ -1219,13 +1394,46 @@
}

for await (const chunk of iterable) {
const result = this.processChatChunk(chunk, interaction);
const result = this.processChatChunk(
chunk,
interaction,
thinkingExtractor,
);
completion.push(...result.completion);
thinking.push(...result.thinking);
if (result.usage !== null) {
usage = result.usage;
}
yield result.chunk;
// Yield thinking chunk first if present
if (result.thinkingChunk) {
yield result.thinkingChunk;
}
// Only yield the main chunk if it has content or tool calls
const hasToolCalls =
result.chunk.role === "assistant" &&
result.chunk.toolCalls?.length;
const hasContent =
result.chunk.content && result.chunk.content.length > 0;

if (hasToolCalls || hasContent) {
yield result.chunk;
}
}

// Flush any remaining content from the extractor
if (thinkingExtractor) {
const flushed = thinkingExtractor.flush();
if (flushed.thinking) {
thinking.push(flushed.thinking);
yield { role: "thinking", content: flushed.thinking };
}
if (flushed.content) {
completion.push(flushed.content);
yield { role: "assistant", content: flushed.content };
}
}
} else {
if (logEnabled) {
@@ -1245,13 +1453,46 @@
signal,
completionOptions,
)) {
const result = this.processChatChunk(chunk, interaction);
const result = this.processChatChunk(
chunk,
interaction,
thinkingExtractor,
);
completion.push(...result.completion);
thinking.push(...result.thinking);
if (result.usage !== null) {
usage = result.usage;
}
yield result.chunk;
// Yield thinking chunk first if present
if (result.thinkingChunk) {
yield result.thinkingChunk;
}
// Only yield the main chunk if it has content or tool calls
const hasToolCalls =
result.chunk.role === "assistant" &&
result.chunk.toolCalls?.length;
const hasContent =
result.chunk.content && result.chunk.content.length > 0;

if (hasToolCalls || hasContent) {
yield result.chunk;
}
}

// Flush any remaining content from the extractor
if (thinkingExtractor) {
const flushed = thinkingExtractor.flush();
if (flushed.thinking) {
thinking.push(flushed.thinking);
yield { role: "thinking", content: flushed.thinking };
}
if (flushed.content) {
completion.push(flushed.content);
yield { role: "assistant", content: flushed.content };
}
}
}
}
22 changes: 22 additions & 0 deletions core/llm/llms/Vllm.ts
@@ -20,6 +20,28 @@ interface VllmRerankResponse {
results: VllmRerankItem[];
}

/**
* vLLM provider for Continue.
*
* vLLM supports thinking/reasoning outputs in two ways:
* 1. Via the standard `reasoning_content` field in the response (default OpenAI format)
* 2. Via custom tags in the response content (configurable)
*
* For custom thinking tag formats, you can configure `thinkingOpenTag` and `thinkingCloseTag`
* in the model options. For example:
*
* ```yaml
* models:
* - provider: vllm
* model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
* apiBase: http://localhost:8000
* thinkingOpenTag: "<think>"
* thinkingCloseTag: "</think>"
* ```
*
* See vLLM documentation for more details:
* https://docs.vllm.ai/en/latest/features/reasoning_outputs.html
*/
class Vllm extends OpenAI {
static providerName = "vllm";
constructor(options: LLMOptions) {