Commit 5978e8f

fixes

1 parent 74bebb1 commit 5978e8f

3 files changed: +93, -14 lines changed


optillm.py

Lines changed: 18 additions & 7 deletions
@@ -108,14 +108,24 @@ def count_reasoning_tokens(text: str, tokenizer=None) -> int:
         return 0
 
     # Extract all content within <think>...</think> tags
-    think_pattern = r'<think>(.*?)</think>'
-    matches = re.findall(think_pattern, text, re.DOTALL)
+    # Handle both complete and truncated think blocks
 
-    if not matches:
-        return 0
+    # First, find all complete <think>...</think> blocks
+    complete_pattern = r'<think>(.*?)</think>'
+    complete_matches = re.findall(complete_pattern, text, re.DOTALL)
+
+    # Then check for unclosed <think> tag (truncated response)
+    # This finds <think> that doesn't have a matching </think> after it
+    truncated_pattern = r'<think>(?!.*</think>)(.*)$'
+    truncated_match = re.search(truncated_pattern, text, re.DOTALL)
 
     # Combine all thinking content
-    thinking_content = ''.join(matches)
+    thinking_content = ''.join(complete_matches)
+    if truncated_match:
+        thinking_content += truncated_match.group(1)
+
+    if not thinking_content:
+        return 0
 
     if tokenizer and hasattr(tokenizer, 'encode'):
         # Use tokenizer for precise counting
@@ -125,8 +135,9 @@ def count_reasoning_tokens(text: str, tokenizer=None) -> int:
     except Exception as e:
         logger.warning(f"Failed to count tokens with tokenizer: {e}")
 
-    # Fallback: rough estimation (4 chars per token on average)
-    return max(0, len(thinking_content.strip()) // 4)
+    # Fallback: rough estimation (4 chars per token on average, minimum 1 token for non-empty content)
+    content_length = len(thinking_content.strip())
+    return max(1, content_length // 4) if content_length > 0 else 0
 
 # Server configuration
 server_config = {
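
To make the new matching logic concrete, here is a minimal standalone sketch of the two patterns from the diff above; extract_think_content is a hypothetical helper used only for illustration, not a function in the codebase:

import re

def extract_think_content(text: str) -> str:
    """Collect reasoning text from both complete and truncated <think> blocks."""
    # Complete blocks: non-greedy match between the opening and closing tags
    complete = re.findall(r'<think>(.*?)</think>', text, re.DOTALL)
    # Truncated block: a <think> with no </think> anywhere after it; the
    # negative lookahead keeps it from re-matching inside complete blocks
    truncated = re.search(r'<think>(?!.*</think>)(.*)$', text, re.DOTALL)
    content = ''.join(complete)
    if truncated:
        content += truncated.group(1)
    return content

print(extract_think_content("<think>done</think> answer"))        # done
print(extract_think_content("<think>cut off mid-thought"))        # cut off mid-thought
print(extract_think_content("<think>a</think><think>b</think>"))  # ab (no false truncation)

The negative lookahead (?!.*</think>) is what prevents false positives: any <think> that is eventually followed by a closing tag is handled by the first pattern alone.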

optillm/inference.py

Lines changed: 18 additions & 7 deletions
@@ -45,14 +45,24 @@ def count_reasoning_tokens(text: str, tokenizer=None) -> int:
         return 0
 
     # Extract all content within <think>...</think> tags
-    think_pattern = r'<think>(.*?)</think>'
-    matches = re.findall(think_pattern, text, re.DOTALL)
+    # Handle both complete and truncated think blocks
 
-    if not matches:
-        return 0
+    # First, find all complete <think>...</think> blocks
+    complete_pattern = r'<think>(.*?)</think>'
+    complete_matches = re.findall(complete_pattern, text, re.DOTALL)
+
+    # Then check for unclosed <think> tag (truncated response)
+    # This finds <think> that doesn't have a matching </think> after it
+    truncated_pattern = r'<think>(?!.*</think>)(.*)$'
+    truncated_match = re.search(truncated_pattern, text, re.DOTALL)
 
     # Combine all thinking content
-    thinking_content = ''.join(matches)
+    thinking_content = ''.join(complete_matches)
+    if truncated_match:
+        thinking_content += truncated_match.group(1)
+
+    if not thinking_content:
+        return 0
 
     if tokenizer and hasattr(tokenizer, 'encode'):
         # Use tokenizer for precise counting
@@ -62,8 +72,9 @@ def count_reasoning_tokens(text: str, tokenizer=None) -> int:
     except Exception as e:
         logger.warning(f"Failed to count tokens with tokenizer: {e}")
 
-    # Fallback: rough estimation (4 chars per token on average)
-    return max(0, len(thinking_content.strip()) // 4)
+    # Fallback: rough estimation (4 chars per token on average, minimum 1 token for non-empty content)
+    content_length = len(thinking_content.strip())
+    return max(1, content_length // 4) if content_length > 0 else 0
 
 # MLX Support for Apple Silicon
 try:
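
The fallback change here is behavioral, not just cosmetic: under the old code, reasoning content shorter than four characters estimated to 0 tokens and was indistinguishable from no reasoning at all. A minimal sketch of the new arithmetic (estimate_fallback_tokens is an illustrative name, not the actual function):

def estimate_fallback_tokens(thinking_content: str) -> int:
    # ~4 characters per token on average, but never report 0 for non-empty content
    content_length = len(thinking_content.strip())
    return max(1, content_length // 4) if content_length > 0 else 0

assert estimate_fallback_tokens("") == 0         # empty stays 0
assert estimate_fallback_tokens("a") == 1        # old code: max(0, 1 // 4) == 0
assert estimate_fallback_tokens("x" * 40) == 10  # roughly 4 chars per token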

tests/test_reasoning_simple.py

Lines changed: 57 additions & 0 deletions
@@ -85,6 +85,63 @@ def encode(self, text):
 
         result = optillm_count(text, tokenizer)
         self.assertGreater(result, 0, "Should fallback to character estimation")
+
+    def test_count_reasoning_tokens_truncated_response(self):
+        """Test counting tokens when response is truncated (no closing </think> tag)"""
+        # Test truncated think tag
+        truncated_text = "<think>This reasoning was cut off due to max tokens"
+
+        result1 = optillm_count(truncated_text)
+        result2 = inference_count(truncated_text)
+
+        self.assertGreater(result1, 0, "Should count tokens from truncated think block")
+        self.assertEqual(result1, result2, "Both functions should return same result")
+
+    def test_count_reasoning_tokens_mixed_complete_and_truncated(self):
+        """Test with both complete and truncated think blocks"""
+        mixed_text = """
+        <think>First complete reasoning block</think>
+        Some output here
+        <think>This second block was truncated and never closed
+        """
+
+        result = optillm_count(mixed_text)
+        self.assertGreater(result, 0, "Should count tokens from both complete and truncated blocks")
+
+        # Should be more than just the first block alone
+        first_block_only = "<think>First complete reasoning block</think>"
+        first_result = optillm_count(first_block_only)
+        self.assertGreater(result, first_result, "Should include truncated content")
+
+    def test_count_reasoning_tokens_no_false_positives(self):
+        """Test that we don't count think-like content that isn't actually truncated"""
+        # This should NOT be counted as truncated since there's a </think> later
+        text_with_complete_blocks = "<think>First block</think>Output<think>Second complete block</think>"
+
+        result = optillm_count(text_with_complete_blocks)
+
+        # Count manually - should only be the content inside the two complete blocks
+        manual_count = optillm_count("<think>First blockSecond complete block</think>")
+        self.assertEqual(result, manual_count, "Should only count complete blocks, not detect false truncation")
+
+    def test_count_reasoning_tokens_edge_cases_truncated(self):
+        """Test edge cases with truncated responses"""
+        test_cases = [
+            ("<think>", 0),  # Just opening tag, no content
+            ("<think>a", 1),  # Minimal content
+            ("Some output <think>reasoning here", None),  # Truncated at end
+            ("<think>multi\nline\ntruncated", None),  # Multiline truncated
+        ]
+
+        for text, expected_min in test_cases:
+            result = optillm_count(text)
+            if expected_min is not None:
+                if expected_min == 0:
+                    self.assertEqual(result, expected_min, f"Should return {expected_min} for: {text}")
+                else:
+                    self.assertGreaterEqual(result, expected_min, f"Should be at least {expected_min} for: {text}")
+            else:
+                self.assertGreater(result, 0, f"Should count truncated content for: {text}")
 
 
 class TestInferenceStructures(unittest.TestCase):
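
Assuming a standard checkout (the module path below is inferred from the file location shown above, not stated in the commit), the new cases should run with the stock unittest runner from the repository root:

python -m unittest tests.test_reasoning_simple -v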
