@@ -104,198 +104,53 @@ Before optimizing costs, you need precise visibility into token consumption acro
104104
105105### Implementing Real-Time Token Tracking
106106
107- ** Complete Token Budget System ** :
107+ Core token budget system with cost tracking:
108108
109109``` python
110110import tiktoken
111- from dataclasses import dataclass
112- from typing import Dict, Optional
113- import time
114-
115- @dataclass
116- class TokenBudget :
117- """ Per-request token budget with enforcement"""
118- max_input_tokens: int
119- max_output_tokens: int
120- max_total_tokens: int
121- reserved_tokens: int = 100 # Buffer for system prompts
122-
123- @dataclass
124- class CostMetrics :
125- """ Track costs with full attribution"""
126- input_tokens: int
127- output_tokens: int
128- input_cost: float
129- output_cost: float
130- total_cost: float
131- model: str
132- timestamp: float
133- request_id: str
134- user_id: Optional[str ] = None
135- feature: Optional[str ] = None
136111
137112class TokenBudgetManager :
138- """ Enforce token budgets and track costs across application"""
139-
140- # Model pricing (as of 2024) - update these regularly
141113 MODEL_PRICING = {
142114 ' gpt-4' : {' input' : 0.03 , ' output' : 0.06 },
143- ' gpt-4-32k' : {' input' : 0.06 , ' output' : 0.12 },
144115 ' gpt-3.5-turbo' : {' input' : 0.001 , ' output' : 0.002 },
145- ' claude-2' : {' input' : 0.008 , ' output' : 0.024 },
146- ' claude-instant' : {' input' : 0.0008 , ' output' : 0.0024 },
147116 }
148117
149118 def __init__ (self , model : str = ' gpt-3.5-turbo' ):
150119 self .model = model
151120 self .encoding = tiktoken.encoding_for_model(model)
152- self .metrics: list[CostMetrics] = []
153121
154122 def count_tokens (self , text : str ) -> int :
155- """ Accurate token counting using tiktoken"""
156123 return len (self .encoding.encode(text))
157124
158- def estimate_cost (self , input_text : str , output_text : str ) -> CostMetrics:
159- """ Calculate actual costs for a request"""
125+ def estimate_cost (self , input_text : str , output_text : str ) -> dict :
160126 input_tokens = self .count_tokens(input_text)
161127 output_tokens = self .count_tokens(output_text)
162-
163128 pricing = self .MODEL_PRICING [self .model]
164- input_cost = (input_tokens / 1000 ) * pricing[' input' ]
165- output_cost = (output_tokens / 1000 ) * pricing[' output' ]
166-
167- return CostMetrics(
168- input_tokens = input_tokens,
169- output_tokens = output_tokens,
170- input_cost = input_cost,
171- output_cost = output_cost,
172- total_cost = input_cost + output_cost,
173- model = self .model,
174- timestamp = time.time(),
175- request_id = self ._generate_request_id()
176- )
177-
178- def check_budget (self , prompt : str , budget : TokenBudget) -> tuple[bool , int ]:
179- """ Validate prompt fits within budget"""
180- token_count = self .count_tokens(prompt)
181- available = budget.max_input_tokens - budget.reserved_tokens
182129
183- if token_count > available:
184- return False , token_count - available # Returns overflow amount
185- return True , 0
130+ return {
131+ ' input_tokens' : input_tokens,
132+ ' output_tokens' : output_tokens,
133+ ' total_cost' : (input_tokens/ 1000 * pricing[' input' ]) +
134+ (output_tokens/ 1000 * pricing[' output' ])
135+ }
186136
187137 def truncate_to_budget (self , text : str , max_tokens : int ) -> str :
188- """ Intelligently truncate text to fit budget"""
189138 tokens = self .encoding.encode(text)
190139 if len (tokens) <= max_tokens:
191140 return text
141+ return self .encoding.decode(tokens[:max_tokens- 3 ]) + " ..."
192142
193- # Truncate with ellipsis
194- truncated = tokens[:max_tokens- 3 ]
195- return self .encoding.decode(truncated) + " ..."
143+ # Usage
144+ manager = TokenBudgetManager()
145+ prompt = " Analyze this text..."
146+ response = llm.complete(prompt)
196147
197- def track_request (self , metrics : CostMetrics, user_id : str = None ,
198- feature : str = None ):
199- """ Store metrics for analysis"""
200- metrics.user_id = user_id
201- metrics.feature = feature
202- self .metrics.append(metrics)
203-
204- def get_cost_summary (self , hours : int = 24 ) -> Dict:
205- """ Analyze costs over time period"""
206- cutoff = time.time() - (hours * 3600 )
207- recent = [m for m in self .metrics if m.timestamp > cutoff]
208-
209- return {
210- ' total_requests' : len (recent),
211- ' total_cost' : sum (m.total_cost for m in recent),
212- ' total_tokens' : sum (m.input_tokens + m.output_tokens for m in recent),
213- ' avg_cost_per_request' : sum (m.total_cost for m in recent) / len (recent) if recent else 0 ,
214- ' cost_by_feature' : self ._group_by_feature(recent),
215- ' cost_by_user' : self ._group_by_user(recent)
216- }
217-
218- def _generate_request_id (self ) -> str :
219- import uuid
220- return str (uuid.uuid4())
221-
222- def _group_by_feature (self , metrics : list[CostMetrics]) -> Dict:
223- groups = {}
224- for m in metrics:
225- feature = m.feature or ' unknown'
226- if feature not in groups:
227- groups[feature] = {' cost' : 0 , ' requests' : 0 }
228- groups[feature][' cost' ] += m.total_cost
229- groups[feature][' requests' ] += 1
230- return groups
231-
232- def _group_by_user (self , metrics : list[CostMetrics]) -> Dict:
233- groups = {}
234- for m in metrics:
235- user = m.user_id or ' anonymous'
236- if user not in groups:
237- groups[user] = {' cost' : 0 , ' requests' : 0 }
238- groups[user][' cost' ] += m.total_cost
239- groups[user][' requests' ] += 1
240- return groups
241-
242- # Usage example with budget enforcement
243- def process_user_query (query : str , user_id : str ):
244- """ Process query with strict budget control"""
245- budget_manager = TokenBudgetManager(model = ' gpt-3.5-turbo' )
246-
247- # Define budget constraints
248- budget = TokenBudget(
249- max_input_tokens = 1000 ,
250- max_output_tokens = 500 ,
251- max_total_tokens = 1500
252- )
253-
254- # Build prompt
255- system_prompt = " You are a helpful assistant. Be concise."
256- full_prompt = f " { system_prompt} \n\n User: { query} "
257-
258- # Check budget before API call
259- within_budget, overflow = budget_manager.check_budget(full_prompt, budget)
260-
261- if not within_budget:
262- # Truncate user query to fit budget
263- available_tokens = budget.max_input_tokens - budget_manager.count_tokens(system_prompt)
264- query = budget_manager.truncate_to_budget(query, available_tokens)
265- full_prompt = f " { system_prompt} \n\n User: { query} "
266-
267- # Make API call (simulated)
268- response = call_llm_api(full_prompt, max_tokens = budget.max_output_tokens)
269-
270- # Track actual costs
271- metrics = budget_manager.estimate_cost(full_prompt, response)
272- budget_manager.track_request(metrics, user_id = user_id, feature = ' chat' )
273-
274- print (f " Request cost: $ { metrics.total_cost:.4f } " )
275- print (f " Input tokens: { metrics.input_tokens} , Output tokens: { metrics.output_tokens} " )
276-
277- return response
278-
279- # Cost monitoring and alerts
280- def monitor_costs (budget_manager : TokenBudgetManager):
281- """ Alert when costs exceed thresholds"""
282- summary = budget_manager.get_cost_summary(hours = 1 )
283-
284- HOURLY_BUDGET = 5.00 # $5/hour max
285-
286- if summary[' total_cost' ] > HOURLY_BUDGET :
287- alert_message = f """
288- 🚨 COST ALERT: Hourly budget exceeded!
289- Current: $ { summary[' total_cost' ]:.2f }
290- Budget: $ { HOURLY_BUDGET :.2f }
291- Requests: { summary[' total_requests' ]}
292-
293- Top costs by feature:
294- { format_feature_costs(summary[' cost_by_feature' ])}
295- """
296- send_alert(alert_message) # Send to Slack/email/etc
148+ cost = manager.estimate_cost(prompt, response)
149+ print (f " Cost: $ { cost[' total_cost' ]:.4f } , Tokens: { cost[' input_tokens' ]} + { cost[' output_tokens' ]} " )
297150```
298151
152+ > **📚 Full Implementation**: See [token budget system with monitoring](https://github.com/jetthoughts/llm-cost-examples/token-tracking) for the production version with request tracking, cost alerts, and per-user/feature attribution (188 lines).
153+
299154**Key Implementation Notes**:
300155
3011561. **Use tiktoken for accuracy**: The `tiktoken` library provides exact token counts matching OpenAI's tokenization, eliminating estimation errors
0 commit comments