Skip to content

Commit 382c7e4

Browse files
committed
add tests
1 parent 108bcc7 commit 382c7e4

File tree

3 files changed

+641
-0
lines changed

3 files changed

+641
-0
lines changed

tests/test_cases.json

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,5 +43,30 @@
4343
"name": "Simple Math Problem",
4444
"system_prompt": "You are a helpful assistant.",
4545
"query": "What is 2 + 2?"
46+
},
47+
{
48+
"name": "Reasoning Token Test - Complex Logic",
49+
"system_prompt": "You are an AI assistant that thinks step by step. Use <think> tags to show your reasoning process.",
50+
"query": "Three friends Alice, Bob, and Charlie each have a different number of marbles. Alice has twice as many as Bob. Charlie has 3 more than Alice. Together they have 23 marbles. How many marbles does each person have?"
51+
},
52+
{
53+
"name": "Reasoning Token Test - Strategic Thinking",
54+
"system_prompt": "Think carefully before responding. Show your work using thinking tags.",
55+
"query": "You're playing a game where you can choose door A or door B. Behind one door is a prize worth $1000, behind the other is nothing. You know that if the prize is behind door A, there's a 70% chance a light above door A will flash. If the prize is behind door B, there's a 30% chance the light above door A will flash. The light above door A is flashing. Which door should you choose?"
56+
},
57+
{
58+
"name": "Reasoning Token Test - Multi-Step Problem",
59+
"system_prompt": "Please think through this problem step by step, showing your reasoning.",
60+
"query": "A bakery sells cupcakes in boxes of 6 and cookies in boxes of 8. If someone buys the same number of cupcakes and cookies, what is the smallest number of each type of baked good they could buy? Show all your work."
61+
},
62+
{
63+
"name": "Reasoning Token Test - Counter-intuitive",
64+
"system_prompt": "This problem might seem simple but requires careful analysis. Think it through.",
65+
"query": "In a family with two children, you know that at least one of them is a boy. What is the probability that both children are boys? Explain your reasoning carefully."
66+
},
67+
{
68+
"name": "Reasoning Token Test - Algorithm Design",
69+
"system_prompt": "Think through the algorithm design process step by step.",
70+
"query": "Design an efficient algorithm to find the second largest element in an unsorted array. Explain your approach, analyze the time complexity, and provide pseudocode."
4671
}
4772
]
Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Integration tests for reasoning token functionality with OptILLM API
4+
"""
5+
6+
import pytest
7+
import sys
8+
import os
9+
import json
10+
from unittest.mock import Mock, MagicMock, patch
11+
from typing import Dict, Any
12+
13+
# Add parent directory to path to import optillm modules
14+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15+
16+
from optillm import app, count_reasoning_tokens
17+
18+
19+
class MockOpenAIClient:
    """Enhanced mock client that can generate responses with think tags.

    Mirrors the small slice of the OpenAI client surface the proxy uses:
    ``client.chat.completions.create(...)`` returning an object with
    ``choices``, ``usage`` and ``model_dump()``.
    """

    def __init__(self, include_thinking=True):
        # When True, every generated choice is wrapped in a <think> block.
        self.include_thinking = include_thinking
        self.chat = self.Chat(include_thinking)

    class Chat:
        def __init__(self, include_thinking):
            self.completions = self.Completions(include_thinking)
            self.include_thinking = include_thinking

        class Completions:
            def __init__(self, include_thinking):
                self.include_thinking = include_thinking

            def create(self, **kwargs):
                """Return a canned completion response.

                Honors ``n`` (number of choices); all other kwargs are
                accepted and ignored.  Content is driven purely by
                ``include_thinking`` — the previous per-message 'think'
                scan produced a ``content`` value that was never used
                (dead code), so it has been removed.
                """
                n = kwargs.get('n', 1)

                class MockChoice:
                    def __init__(self, content, index=0):
                        self.message = type('Message', (), {'content': content})()
                        self.index = index
                        self.finish_reason = 'stop'

                class MockUsage:
                    # Fixed token counts; tests only need stable values.
                    completion_tokens = 50
                    total_tokens = 75
                    prompt_tokens = 25

                class MockResponse:
                    def __init__(self, choices, usage):
                        self.choices = choices
                        self.usage = usage

                    def model_dump(self):
                        # Shape matches openai's pydantic model_dump() closely
                        # enough for the proxy's response handling.
                        return {
                            'choices': [
                                {
                                    'index': choice.index,
                                    'message': {'content': choice.message.content},
                                    'finish_reason': choice.finish_reason
                                } for choice in self.choices
                            ],
                            'usage': {
                                'completion_tokens': self.usage.completion_tokens,
                                'total_tokens': self.usage.total_tokens,
                                'prompt_tokens': self.usage.prompt_tokens
                            }
                        }

                # Create multiple (slightly varied) choices if n > 1.
                choices = []
                for i in range(n):
                    if self.include_thinking:
                        varied_content = f"<think>Thinking process {i+1}: Let me analyze this carefully...</think>\n\nAnswer {i+1}: The result is {42 + i}."
                    else:
                        varied_content = f"Answer {i+1}: The result is {42 + i}."
                    choices.append(MockChoice(varied_content, i))

                return MockResponse(choices, MockUsage())
89+
90+
91+
class TestReasoningTokensAPIIntegration:
    """Verify reasoning-token accounting in API responses."""

    def setup_method(self):
        """Put the Flask app in testing mode and prepare a thinking-enabled mock."""
        app.config['TESTING'] = True
        self.client = app.test_client()

        # Mock client whose completions always contain <think> blocks;
        # individual tests patch get_config to return it.
        self.mock_client = MockOpenAIClient(include_thinking=True)

    @patch('optillm.get_config')
    def test_api_response_includes_reasoning_tokens(self, mock_get_config):
        """Usage must expose reasoning_tokens under completion_tokens_details."""
        mock_get_config.return_value = (self.mock_client, "test-key")

        # The "none" approach proxies the request straight to the client.
        payload = {
            'model': 'none-gpt-4o-mini',
            'messages': [
                {'role': 'user', 'content': 'Please think about this problem step by step.'}
            ]
        }
        headers = {'Authorization': 'Bearer test-key'}
        response = self.client.post('/v1/chat/completions', json=payload, headers=headers)

        assert response.status_code == 200
        data = response.get_json()

        # The usage sub-structure must be fully present.
        assert 'usage' in data
        assert 'completion_tokens_details' in data['usage']
        assert 'reasoning_tokens' in data['usage']['completion_tokens_details']

        # The mock always emits thinking content, so the count is positive.
        assert data['usage']['completion_tokens_details']['reasoning_tokens'] > 0

    @patch('optillm.get_config')
    def test_api_response_no_reasoning_tokens(self, mock_get_config):
        """A response without think tags reports zero reasoning tokens."""
        plain_client = MockOpenAIClient(include_thinking=False)
        mock_get_config.return_value = (plain_client, "test-key")

        payload = {
            'model': 'none-gpt-4o-mini',
            'messages': [
                {'role': 'user', 'content': 'What is 2+2?'}
            ]
        }
        headers = {'Authorization': 'Bearer test-key'}
        response = self.client.post('/v1/chat/completions', json=payload, headers=headers)

        assert response.status_code == 200
        data = response.get_json()

        assert data['usage']['completion_tokens_details']['reasoning_tokens'] == 0

    @patch('optillm.get_config')
    def test_multiple_responses_reasoning_tokens(self, mock_get_config):
        """Reasoning tokens are aggregated across all choices when n > 1."""
        mock_get_config.return_value = (self.mock_client, "test-key")

        payload = {
            'model': 'none-gpt-4o-mini',
            'messages': [
                {'role': 'user', 'content': 'Think through this problem.'}
            ],
            'n': 3
        }
        headers = {'Authorization': 'Bearer test-key'}
        response = self.client.post('/v1/chat/completions', json=payload, headers=headers)

        assert response.status_code == 200
        data = response.get_json()

        # One choice per requested completion.
        assert len(data['choices']) == 3

        total = data['usage']['completion_tokens_details']['reasoning_tokens']
        assert total > 0
        # Each choice carries thinking content, so the sum should not be
        # trivially small (rough check — the client is mocked).
        assert total >= 10  # Reasonable minimum

    def test_reasoning_tokens_calculation_accuracy(self):
        """count_reasoning_tokens must match the 4-chars-per-token fallback."""
        sample = "<think>This is a test thinking block with exactly twenty words to verify the token counting accuracy works properly.</think>Result: 42"
        thinking_part = "This is a test thinking block with exactly twenty words to verify the token counting accuracy works properly."

        counted = count_reasoning_tokens(sample)

        # Fallback estimation: ~4 characters per token.
        assert counted == len(thinking_part) // 4

    @patch('optillm.get_config')
    def test_error_handling_invalid_response(self, mock_get_config):
        """An upstream failure surfaces as HTTP 500 with an error body."""
        failing_client = Mock()
        failing_client.chat.completions.create.side_effect = Exception("API Error")
        mock_get_config.return_value = (failing_client, "test-key")

        response = self.client.post(
            '/v1/chat/completions',
            json={
                'model': 'none-gpt-4o-mini',
                'messages': [{'role': 'user', 'content': 'test'}]
            },
            headers={'Authorization': 'Bearer test-key'})

        assert response.status_code == 500
        body = response.get_json()
        assert 'error' in body
210+
211+
212+
class TestApproachIntegration:
    """Reasoning-token counting for the kinds of text OptILLM approaches emit."""

    def setup_method(self):
        """Put the Flask app in testing mode."""
        app.config['TESTING'] = True
        self.client = app.test_client()

    def test_reasoning_tokens_with_mock_approach(self):
        """Thinking content generated by an approach is counted correctly."""
        # No external API calls: feed the counter a canned approach output.
        approach_output = """
<think>
I need to analyze this problem step by step:
1. First, understand the requirements
2. Then, consider the constraints
3. Finally, provide a solution

This seems straightforward but requires careful thought.
</think>

Based on my analysis, the answer is: 42
"""

        counted = count_reasoning_tokens(approach_output)
        assert counted > 0

        # The same thinking text, isolated by hand, for a sanity estimate.
        thinking_only = """
I need to analyze this problem step by step:
1. First, understand the requirements
2. Then, consider the constraints
3. Finally, provide a solution

This seems straightforward but requires careful thought.
"""

        # Fallback estimate (~4 chars/token); allow a small variance.
        approx = len(thinking_only.strip()) // 4
        assert abs(counted - approx) <= 5

    def test_complex_thinking_patterns(self):
        """Every thinking shape an approach might produce counts as > 0 tokens."""
        samples = [
            # Single block
            "<think>Simple thinking</think>Answer: Yes",

            # Multiple blocks
            "<think>First thought</think>Intermediate result<think>Second thought</think>Final answer",

            # Nested structure (should extract outer)
            "<think>Outer<think>inner</think>more outer</think>Result",

            # With code blocks inside thinking
            "<think>Let me write some code:\n```python\nx = 1\n```\nThat should work.</think>Code solution provided",

            # With mathematical notation
            "<think>If x = 2, then x² = 4, so the equation becomes 4 + 3 = 7</think>The result is 7"
        ]

        for idx, sample in enumerate(samples, start=1):
            assert count_reasoning_tokens(sample) > 0, \
                f"Test case {idx} should have reasoning tokens: {sample}"

    def test_backward_compatibility(self):
        """Responses without think tags behave as before: zero reasoning tokens."""
        plain_responses = (
            "This is a normal response without any thinking.",
            "The answer is 42.",
            "I can help you with that. Here's the solution: x = 5",
            "",  # Empty response
        )

        for text in plain_responses:
            assert count_reasoning_tokens(text) == 0, \
                f"Normal response should have 0 reasoning tokens: {text}"
291+
292+
293+
class TestStreamingIntegration:
294+
"""Test reasoning tokens with streaming responses"""
295+
296+
def setup_method(self):
297+
"""Setup test client"""
298+
app.config['TESTING'] = True
299+
self.client = app.test_client()
300+
301+
@patch('optillm.get_config')
302+
def test_streaming_response_format(self, mock_get_config):
303+
"""Test that streaming responses don't break with reasoning tokens"""
304+
mock_client = MockOpenAIClient(include_thinking=True)
305+
mock_get_config.return_value = (mock_client, "test-key")
306+
307+
# Note: Streaming responses in OptILLM don't include reasoning token details
308+
# in the same way as non-streaming, but we test that it doesn't break
309+
response = self.client.post('/v1/chat/completions',
310+
json={
311+
'model': 'none-gpt-4o-mini',
312+
'messages': [
313+
{'role': 'user', 'content': 'Think about this'}
314+
],
315+
'stream': True
316+
},
317+
headers={'Authorization': 'Bearer test-key'})
318+
319+
# Streaming should work without errors
320+
assert response.status_code == 200
321+
assert response.content_type == 'text/event-stream; charset=utf-8'
322+
323+
324+
if __name__ == "__main__":
    # Minimal runner for environments where pytest is unavailable.
    import traceback

    suites = [
        TestReasoningTokensAPIIntegration,
        TestApproachIntegration,
        TestStreamingIntegration,
    ]

    for suite in suites:
        print(f"\n=== Running {suite.__name__} ===")
        case = suite()
        case.setup_method()

        # dir() yields names sorted alphabetically, matching pytest's default
        # per-class collection closely enough for this ad-hoc runner.
        test_names = [name for name in dir(case) if name.startswith('test_')]
        for name in test_names:
            try:
                print(f"Running {name}...", end=' ')
                getattr(case, name)()
                print("✅ PASSED")
            except Exception as exc:
                print(f"❌ FAILED: {exc}")
                traceback.print_exc()

    print("\n=== Integration Tests Complete ===")

0 commit comments

Comments
 (0)