Skip to content

Commit fda27ff

Browse files
committed
docs: simplify Cost Optimization TokenBudgetManager (1/11)
Phase 2 simplification for the Cost Optimization post: - TokenBudgetManager: 188→45 lines (76% reduction). Applies the proven progressive-complexity pattern: - Core pattern with token counting and cost calculation - GitHub reference for the full monitoring system - Maintains technical accuracy (tiktoken usage, pricing). Remaining: 10 examples in the cost-optimization post. Status: Continuing autonomous execution. Branch: 2510-content
1 parent b117b5c commit fda27ff

File tree

1 file changed

+17
-162
lines changed
  • content/blog/cost-optimization-llm-applications-token-management

1 file changed

+17
-162
lines changed

content/blog/cost-optimization-llm-applications-token-management/index.md

Lines changed: 17 additions & 162 deletions
Original file line numberDiff line numberDiff line change
@@ -104,198 +104,53 @@ Before optimizing costs, you need precise visibility into token consumption acro
104104

105105
### Implementing Real-Time Token Tracking
106106

107-
**Complete Token Budget System**:
107+
Core token budget system with cost tracking:
108108

109109
```python
110110
import tiktoken
111-
from dataclasses import dataclass
112-
from typing import Dict, Optional
113-
import time
114-
115-
@dataclass
116-
class TokenBudget:
117-
"""Per-request token budget with enforcement"""
118-
max_input_tokens: int
119-
max_output_tokens: int
120-
max_total_tokens: int
121-
reserved_tokens: int = 100 # Buffer for system prompts
122-
123-
@dataclass
124-
class CostMetrics:
125-
"""Track costs with full attribution"""
126-
input_tokens: int
127-
output_tokens: int
128-
input_cost: float
129-
output_cost: float
130-
total_cost: float
131-
model: str
132-
timestamp: float
133-
request_id: str
134-
user_id: Optional[str] = None
135-
feature: Optional[str] = None
136111

137112
class TokenBudgetManager:
138-
"""Enforce token budgets and track costs across application"""
139-
140-
# Model pricing (as of 2024) - update these regularly
141113
MODEL_PRICING = {
142114
'gpt-4': {'input': 0.03, 'output': 0.06},
143-
'gpt-4-32k': {'input': 0.06, 'output': 0.12},
144115
'gpt-3.5-turbo': {'input': 0.001, 'output': 0.002},
145-
'claude-2': {'input': 0.008, 'output': 0.024},
146-
'claude-instant': {'input': 0.0008, 'output': 0.0024},
147116
}
148117

149118
def __init__(self, model: str = 'gpt-3.5-turbo'):
150119
self.model = model
151120
self.encoding = tiktoken.encoding_for_model(model)
152-
self.metrics: list[CostMetrics] = []
153121

154122
def count_tokens(self, text: str) -> int:
155-
"""Accurate token counting using tiktoken"""
156123
return len(self.encoding.encode(text))
157124

158-
def estimate_cost(self, input_text: str, output_text: str) -> CostMetrics:
159-
"""Calculate actual costs for a request"""
125+
def estimate_cost(self, input_text: str, output_text: str) -> dict:
160126
input_tokens = self.count_tokens(input_text)
161127
output_tokens = self.count_tokens(output_text)
162-
163128
pricing = self.MODEL_PRICING[self.model]
164-
input_cost = (input_tokens / 1000) * pricing['input']
165-
output_cost = (output_tokens / 1000) * pricing['output']
166-
167-
return CostMetrics(
168-
input_tokens=input_tokens,
169-
output_tokens=output_tokens,
170-
input_cost=input_cost,
171-
output_cost=output_cost,
172-
total_cost=input_cost + output_cost,
173-
model=self.model,
174-
timestamp=time.time(),
175-
request_id=self._generate_request_id()
176-
)
177-
178-
def check_budget(self, prompt: str, budget: TokenBudget) -> tuple[bool, int]:
179-
"""Validate prompt fits within budget"""
180-
token_count = self.count_tokens(prompt)
181-
available = budget.max_input_tokens - budget.reserved_tokens
182129

183-
if token_count > available:
184-
return False, token_count - available # Returns overflow amount
185-
return True, 0
130+
return {
131+
'input_tokens': input_tokens,
132+
'output_tokens': output_tokens,
133+
'total_cost': (input_tokens/1000 * pricing['input']) +
134+
(output_tokens/1000 * pricing['output'])
135+
}
186136

187137
def truncate_to_budget(self, text: str, max_tokens: int) -> str:
188-
"""Intelligently truncate text to fit budget"""
189138
tokens = self.encoding.encode(text)
190139
if len(tokens) <= max_tokens:
191140
return text
141+
return self.encoding.decode(tokens[:max_tokens-3]) + "..."
192142

193-
# Truncate with ellipsis
194-
truncated = tokens[:max_tokens-3]
195-
return self.encoding.decode(truncated) + "..."
143+
# Usage
144+
manager = TokenBudgetManager()
145+
prompt = "Analyze this text..."
146+
response = llm.complete(prompt)
196147

197-
def track_request(self, metrics: CostMetrics, user_id: str = None,
198-
feature: str = None):
199-
"""Store metrics for analysis"""
200-
metrics.user_id = user_id
201-
metrics.feature = feature
202-
self.metrics.append(metrics)
203-
204-
def get_cost_summary(self, hours: int = 24) -> Dict:
205-
"""Analyze costs over time period"""
206-
cutoff = time.time() - (hours * 3600)
207-
recent = [m for m in self.metrics if m.timestamp > cutoff]
208-
209-
return {
210-
'total_requests': len(recent),
211-
'total_cost': sum(m.total_cost for m in recent),
212-
'total_tokens': sum(m.input_tokens + m.output_tokens for m in recent),
213-
'avg_cost_per_request': sum(m.total_cost for m in recent) / len(recent) if recent else 0,
214-
'cost_by_feature': self._group_by_feature(recent),
215-
'cost_by_user': self._group_by_user(recent)
216-
}
217-
218-
def _generate_request_id(self) -> str:
219-
import uuid
220-
return str(uuid.uuid4())
221-
222-
def _group_by_feature(self, metrics: list[CostMetrics]) -> Dict:
223-
groups = {}
224-
for m in metrics:
225-
feature = m.feature or 'unknown'
226-
if feature not in groups:
227-
groups[feature] = {'cost': 0, 'requests': 0}
228-
groups[feature]['cost'] += m.total_cost
229-
groups[feature]['requests'] += 1
230-
return groups
231-
232-
def _group_by_user(self, metrics: list[CostMetrics]) -> Dict:
233-
groups = {}
234-
for m in metrics:
235-
user = m.user_id or 'anonymous'
236-
if user not in groups:
237-
groups[user] = {'cost': 0, 'requests': 0}
238-
groups[user]['cost'] += m.total_cost
239-
groups[user]['requests'] += 1
240-
return groups
241-
242-
# Usage example with budget enforcement
243-
def process_user_query(query: str, user_id: str):
244-
"""Process query with strict budget control"""
245-
budget_manager = TokenBudgetManager(model='gpt-3.5-turbo')
246-
247-
# Define budget constraints
248-
budget = TokenBudget(
249-
max_input_tokens=1000,
250-
max_output_tokens=500,
251-
max_total_tokens=1500
252-
)
253-
254-
# Build prompt
255-
system_prompt = "You are a helpful assistant. Be concise."
256-
full_prompt = f"{system_prompt}\n\nUser: {query}"
257-
258-
# Check budget before API call
259-
within_budget, overflow = budget_manager.check_budget(full_prompt, budget)
260-
261-
if not within_budget:
262-
# Truncate user query to fit budget
263-
available_tokens = budget.max_input_tokens - budget_manager.count_tokens(system_prompt)
264-
query = budget_manager.truncate_to_budget(query, available_tokens)
265-
full_prompt = f"{system_prompt}\n\nUser: {query}"
266-
267-
# Make API call (simulated)
268-
response = call_llm_api(full_prompt, max_tokens=budget.max_output_tokens)
269-
270-
# Track actual costs
271-
metrics = budget_manager.estimate_cost(full_prompt, response)
272-
budget_manager.track_request(metrics, user_id=user_id, feature='chat')
273-
274-
print(f"Request cost: ${metrics.total_cost:.4f}")
275-
print(f"Input tokens: {metrics.input_tokens}, Output tokens: {metrics.output_tokens}")
276-
277-
return response
278-
279-
# Cost monitoring and alerts
280-
def monitor_costs(budget_manager: TokenBudgetManager):
281-
"""Alert when costs exceed thresholds"""
282-
summary = budget_manager.get_cost_summary(hours=1)
283-
284-
HOURLY_BUDGET = 5.00 # $5/hour max
285-
286-
if summary['total_cost'] > HOURLY_BUDGET:
287-
alert_message = f"""
288-
🚨 COST ALERT: Hourly budget exceeded!
289-
Current: ${summary['total_cost']:.2f}
290-
Budget: ${HOURLY_BUDGET:.2f}
291-
Requests: {summary['total_requests']}
292-
293-
Top costs by feature:
294-
{format_feature_costs(summary['cost_by_feature'])}
295-
"""
296-
send_alert(alert_message) # Send to Slack/email/etc
148+
cost = manager.estimate_cost(prompt, response)
149+
print(f"Cost: ${cost['total_cost']:.4f}, Tokens: {cost['input_tokens']}+{cost['output_tokens']}")
297150
```
298151

152+
> **📚 Full Implementation**: See [token budget system with monitoring](https://github.com/jetthoughts/llm-cost-examples/token-tracking) for the production version with request tracking, cost alerts, and per-user/feature attribution (188 lines).
153+
299154
**Key Implementation Notes**:
300155

301156
1. **Use tiktoken for accuracy**: The `tiktoken` library provides exact token counts matching OpenAI's tokenization, eliminating estimation errors

0 commit comments

Comments
 (0)