Commit 108bcc7

fix reasoning tokens
1 parent c133b7b commit 108bcc7

File tree

4 files changed: +105 additions, -12 deletions

optillm.py

Lines changed: 46 additions & 0 deletions

@@ -93,6 +93,41 @@ def get_config():
     default_client = LiteLLMWrapper()
     return default_client, API_KEY
 
+def count_reasoning_tokens(text: str, tokenizer=None) -> int:
+    """
+    Count tokens within <think>...</think> tags in the given text.
+
+    Args:
+        text: The text to analyze
+        tokenizer: Optional tokenizer instance for precise counting
+
+    Returns:
+        Number of reasoning tokens (0 if no think tags found)
+    """
+    if not text or not isinstance(text, str):
+        return 0
+
+    # Extract all content within <think>...</think> tags
+    think_pattern = r'<think>(.*?)</think>'
+    matches = re.findall(think_pattern, text, re.DOTALL)
+
+    if not matches:
+        return 0
+
+    # Combine all thinking content
+    thinking_content = ''.join(matches)
+
+    if tokenizer and hasattr(tokenizer, 'encode'):
+        # Use tokenizer for precise counting
+        try:
+            tokens = tokenizer.encode(thinking_content)
+            return len(tokens)
+        except Exception as e:
+            logger.warning(f"Failed to count tokens with tokenizer: {e}")
+
+    # Fallback: rough estimation (4 chars per token on average)
+    return max(0, len(thinking_content.strip()) // 4)
+
 # Server configuration
 server_config = {
     'approach': 'none',

@@ -678,11 +713,22 @@ def proxy():
     if stream:
         return Response(generate_streaming_response(response, model), content_type='text/event-stream')
     else:
+        # Calculate reasoning tokens from the response
+        reasoning_tokens = 0
+        if isinstance(response, str):
+            reasoning_tokens = count_reasoning_tokens(response)
+        elif isinstance(response, list) and response:
+            # For multiple responses, sum up reasoning tokens from all
+            reasoning_tokens = sum(count_reasoning_tokens(resp) for resp in response if isinstance(resp, str))
+
         response_data = {
             'model': model,
             'choices': [],
             'usage': {
                 'completion_tokens': completion_tokens,
+                'completion_tokens_details': {
+                    'reasoning_tokens': reasoning_tokens
+                }
             }
         }
 
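
The net effect on the non-streaming proxy() path is that the JSON usage block now carries a nested reasoning-token count. A minimal sketch of the resulting shape (the numeric values are illustrative, and any other usage fields the server already emits are omitted here):

usage = {
    "completion_tokens": 512,          # all generated tokens
    "completion_tokens_details": {
        "reasoning_tokens": 387        # tokens found inside <think>...</think>
    }
}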

optillm/inference.py

Lines changed: 51 additions & 5 deletions

@@ -18,6 +18,7 @@
 import traceback
 import platform
 import sys
+import re
 
 from optillm.cot_decoding import cot_decode
 from optillm.entropy_decoding import entropy_decode

@@ -29,6 +30,41 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+def count_reasoning_tokens(text: str, tokenizer=None) -> int:
+    """
+    Count tokens within <think>...</think> tags in the given text.
+
+    Args:
+        text: The text to analyze
+        tokenizer: Optional tokenizer instance for precise counting
+
+    Returns:
+        Number of reasoning tokens (0 if no think tags found)
+    """
+    if not text or not isinstance(text, str):
+        return 0
+
+    # Extract all content within <think>...</think> tags
+    think_pattern = r'<think>(.*?)</think>'
+    matches = re.findall(think_pattern, text, re.DOTALL)
+
+    if not matches:
+        return 0
+
+    # Combine all thinking content
+    thinking_content = ''.join(matches)
+
+    if tokenizer and hasattr(tokenizer, 'encode'):
+        # Use tokenizer for precise counting
+        try:
+            tokens = tokenizer.encode(thinking_content)
+            return len(tokens)
+        except Exception as e:
+            logger.warning(f"Failed to count tokens with tokenizer: {e}")
+
+    # Fallback: rough estimation (4 chars per token on average)
+    return max(0, len(thinking_content.strip()) // 4)
+
 # MLX Support for Apple Silicon
 try:
     import mlx.core as mx

@@ -1502,10 +1538,11 @@ def __init__(
         self.message.logprobs = logprobs
 
 class ChatCompletionUsage:
-    def __init__(self, prompt_tokens: int, completion_tokens: int, total_tokens: int):
+    def __init__(self, prompt_tokens: int, completion_tokens: int, total_tokens: int, reasoning_tokens: int = 0):
         self.prompt_tokens = prompt_tokens
         self.completion_tokens = completion_tokens
         self.total_tokens = total_tokens
+        self.reasoning_tokens = reasoning_tokens
 
 class ChatCompletion:
     def __init__(self, response_dict: Dict):

@@ -1547,7 +1584,10 @@ def model_dump(self) -> Dict:
             "usage": {
                 "prompt_tokens": self.usage.prompt_tokens,
                 "completion_tokens": self.usage.completion_tokens,
-                "total_tokens": self.usage.total_tokens
+                "total_tokens": self.usage.total_tokens,
+                "completion_tokens_details": {
+                    "reasoning_tokens": getattr(self.usage, 'reasoning_tokens', 0)
+                }
             }
         }

@@ -1766,15 +1806,15 @@ def create(
             logger.debug(f"ThinkDeeper tokens: user={user_max_tokens}, thinking={max_thinking_tokens}, adjusted={adjusted_max_tokens}")
 
-            result = thinkdeeper_decode_mlx(
+            result, reasoning_tokens = thinkdeeper_decode_mlx(
                 pipeline.model,
                 pipeline.tokenizer,
                 messages,
                 thinkdeeper_config_with_tokens
             )
         else:
             logger.info("Using PyTorch ThinkDeeper implementation")
-            result = thinkdeeper_decode(
+            result, reasoning_tokens = thinkdeeper_decode(
                 pipeline.current_model,
                 pipeline.tokenizer,
                 messages,

@@ -1850,6 +1890,11 @@ def create(
         prompt_tokens = len(pipeline.tokenizer.encode(prompt))
         completion_tokens = sum(token_counts)
 
+        # Calculate reasoning tokens from all responses
+        total_reasoning_tokens = 0
+        for response in responses:
+            total_reasoning_tokens += count_reasoning_tokens(response, pipeline.tokenizer)
+
         # Create OpenAI-compatible response format
         response_dict = {
             "id": f"chatcmpl-{int(time.time()*1000)}",

@@ -1871,7 +1916,8 @@ def create(
             "usage": {
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
-                "total_tokens": completion_tokens + prompt_tokens
+                "total_tokens": completion_tokens + prompt_tokens,
+                "reasoning_tokens": total_reasoning_tokens
             }
         }
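
A minimal usage sketch of the count_reasoning_tokens helper added to this module. The import path, sample text, and printed values are illustrative assumptions rather than repo code, and the exact-count branch assumes the optional transformers package is installed:

# Hedged sketch: import path, sample text, and expected values are illustrative.
from optillm.inference import count_reasoning_tokens

sample = "<think>Let me reason step by step about the question.</think>The answer is 4."

# Without a tokenizer the helper falls back to a rough estimate (~4 chars per token).
print(count_reasoning_tokens(sample))               # 11 (46 chars of thinking // 4)

# With any object exposing .encode() (e.g. a Hugging Face tokenizer) the count is
# exact for that tokenizer's vocabulary.
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("gpt2")
print(count_reasoning_tokens(sample, tok))

# Text without <think> tags, or non-string input, yields 0.
print(count_reasoning_tokens("The answer is 4."))   # 0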

optillm/thinkdeeper.py

Lines changed: 4 additions & 4 deletions

@@ -168,8 +168,8 @@ def reasoning_effort(self, messages) -> str:
         response = "".join(response_chunks)
         full_response = f"{self.config['start_think_token']}\n{self.config['prefill']}{response}"
 
-        logger.debug(f"Final response length: {len(full_response)} chars, Total thoughts: {self.thought_count}")
-        return full_response
+        logger.debug(f"Final response length: {len(full_response)} chars, Total thoughts: {self.thought_count}, Thinking tokens: {n_thinking_tokens}")
+        return full_response, n_thinking_tokens
 
 def thinkdeeper_decode(
     model: PreTrainedModel,

@@ -192,8 +192,8 @@ def thinkdeeper_decode(
     try:
         processor = ThinkDeeperProcessor(config, tokenizer, model)
-        response = processor.reasoning_effort(messages)
-        return response
+        response, reasoning_tokens = processor.reasoning_effort(messages)
+        return response, reasoning_tokens
 
     except Exception as e:
         logger.error(f"Error in ThinkDeeper processing: {str(e)}")

optillm/thinkdeeper_mlx.py

Lines changed: 4 additions & 3 deletions

@@ -243,7 +243,8 @@ def reasoning_effort(self, messages) -> str:
         response_content = "".join(response_chunks)
         full_response = f"{self.config['start_think_token']}\n{self.config['prefill']}{response_content}"
 
-        return full_response
+        logger.debug(f"MLX Final response length: {len(full_response)} chars, Thinking tokens: {n_thinking_tokens}")
+        return full_response, n_thinking_tokens
 
     def _generate_chunk(self, prompt: str, max_tokens: int, temperature: float) -> str:
         """Generate a small chunk of text using MLX with proper sampler"""

@@ -319,8 +320,8 @@ def thinkdeeper_decode_mlx(
     try:
         processor = MLXThinkDeeperProcessor(config, tokenizer, model)
-        response = processor.reasoning_effort(messages)
-        return response
+        response, reasoning_tokens = processor.reasoning_effort(messages)
+        return response, reasoning_tokens
 
     except Exception as e:
         logger.error(f"Error in MLX ThinkDeeper processing: {str(e)}")
