
Commit c6223b3
feat: Improves token usage tracking and display
Updates the token usage calculation to reflect the new slots API format, differentiating between context tokens and output tokens, and displays output token usage in the processing-state details.
Parent: f0f6f20

4 files changed: +40 −22 lines

tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts

Lines changed: 9 additions & 8 deletions
@@ -93,6 +93,15 @@ export function useProcessingState() {
 		);
 	}
 
+	if (processingState.outputTokensUsed > 0) {
+		const outputPercent = Math.round(
+			(processingState.outputTokensUsed / processingState.outputTokensMax) * 100
+		);
+		details.push(
+			`Output: ${processingState.outputTokensUsed}/${processingState.outputTokensMax} (${outputPercent}%)`
+		);
+	}
+
 	if (
 		currentConfig.showTokensPerSecond &&
 		processingState.tokensPerSecond &&
@@ -101,14 +110,6 @@ export function useProcessingState() {
 		details.push(`${processingState.tokensPerSecond.toFixed(1)} tokens/sec`);
 	}
 
-	if (processingState.temperature !== SETTING_CONFIG_DEFAULT.temperature) {
-		details.push(`Temperature: ${processingState.temperature.toFixed(1)}`);
-	}
-
-	if (processingState.topP !== SETTING_CONFIG_DEFAULT.top_p) {
-		details.push(`Top-p: ${processingState.topP.toFixed(2)}`);
-	}
-
 	if (processingState.speculative) {
 		details.push('Speculative decoding enabled');
 	}
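
For illustration, the new branch boils down to the following self-contained sketch; the function name and parameter shape here are mine, not part of the diff:

// Sketch of the output-usage detail added above; `formatOutputDetail` is a
// hypothetical helper, not a function in the codebase.
function formatOutputDetail(state: { outputTokensUsed: number; outputTokensMax: number }): string | null {
	if (state.outputTokensUsed <= 0) return null;
	const outputPercent = Math.round((state.outputTokensUsed / state.outputTokensMax) * 100);
	return `Output: ${state.outputTokensUsed}/${state.outputTokensMax} (${outputPercent}%)`;
}

// formatOutputDetail({ outputTokensUsed: 1250, outputTokensMax: 2048 })
// -> "Output: 1250/2048 (61%)"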

tools/server/webui/src/lib/services/slots.ts

Lines changed: 24 additions & 2 deletions
@@ -196,6 +196,8 @@ export class SlotsService {
 			tokensRemaining: 0,
 			contextUsed: 0,
 			contextTotal: 4096,
+			outputTokensUsed: 0,
+			outputTokensMax: 2048,
 			temperature: 0.8,
 			topP: 0.95,
 			speculative: false,
@@ -214,8 +216,26 @@ export class SlotsService {
 			status = 'preparing';
 		}
 
-		const promptTokens = Math.floor(activeSlot.prompt.length / 4);
-		const contextUsed = promptTokens + activeSlot.next_token.n_decoded;
+		// Calculate context and output token usage with the new slots format.
+		// n_decoded represents ALL tokens generated (thinking + regular content).
+		const totalTokensGenerated = activeSlot.next_token.n_decoded;
+		const maxOutputTokens = activeSlot.params.max_tokens || activeSlot.params.n_predict;
+
+		// For the context calculation, count only the tokens that will be sent back to the API.
+		// We need to estimate how many of the generated tokens are actual message content
+		// vs thinking content. For now, assume thinking is ~60% of the total output.
+		// This is a rough estimate; in reality we'd need to track this separately.
+		const estimatedThinkingRatio = 0.6;
+		const estimatedMessageTokens = Math.floor(totalTokensGenerated * (1 - estimatedThinkingRatio));
+
+		// Context used = estimated prompt + message content tokens only
+		const maxGenerationTokens = Math.min(maxOutputTokens, Math.floor(activeSlot.n_ctx * 0.4));
+		const estimatedPromptTokens = activeSlot.n_ctx - maxGenerationTokens;
+		const contextUsed = Math.min(activeSlot.n_ctx, estimatedPromptTokens + estimatedMessageTokens);
+
+		// Output tokens: total generated tokens (thinking + regular)
+		const outputTokensUsed = totalTokensGenerated;
+		const outputTokensMax = maxOutputTokens;
 
 		const currentTime = Date.now();
 		const currentTokens = activeSlot.next_token.n_decoded;
@@ -275,6 +295,8 @@ export class SlotsService {
 			tokensRemaining: activeSlot.next_token.n_remain,
 			contextUsed,
 			contextTotal: activeSlot.n_ctx,
+			outputTokensUsed,
+			outputTokensMax,
 			temperature: activeSlot.params.temperature,
 			topP: activeSlot.params.top_p,
 			speculative: activeSlot.speculative,
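
Pulled out as a standalone function, the estimation above behaves like this. A sketch only: the 0.6 thinking ratio and the 0.4 context cap are taken from the diff, while the function name and the flattened argument shape are illustrative:

// Hypothetical standalone version of the estimation done in SlotsService.
interface SlotLike {
	n_ctx: number; // context window size
	n_decoded: number; // ALL generated tokens so far (thinking + content)
	max_tokens: number; // the real code falls back to params.n_predict
}

function estimateUsage(slot: SlotLike) {
	const estimatedThinkingRatio = 0.6; // rough guess, per the diff comment
	const estimatedMessageTokens = Math.floor(slot.n_decoded * (1 - estimatedThinkingRatio));
	const maxGenerationTokens = Math.min(slot.max_tokens, Math.floor(slot.n_ctx * 0.4));
	const estimatedPromptTokens = slot.n_ctx - maxGenerationTokens;
	return {
		contextUsed: Math.min(slot.n_ctx, estimatedPromptTokens + estimatedMessageTokens),
		outputTokensUsed: slot.n_decoded,
		outputTokensMax: slot.max_tokens
	};
}

// estimateUsage({ n_ctx: 4096, n_decoded: 1250, max_tokens: 2048 })
// maxGenerationTokens = min(2048, floor(4096 * 0.4)) = 1638
// estimatedPromptTokens = 4096 - 1638 = 2458
// estimatedMessageTokens = floor(1250 * 0.4) = 500
// -> { contextUsed: 2958, outputTokensUsed: 1250, outputTokensMax: 2048 }

Note that the prompt side is now inferred from the context budget rather than read from activeSlot.prompt, which the api.d.ts change below removes.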

tools/server/webui/src/lib/types/api.d.ts

Lines changed: 2 additions & 9 deletions
@@ -180,23 +180,16 @@ export interface ApiSlotData {
 		dry_base: number;
 		dry_allowed_length: number;
 		dry_penalty_last_n: number;
-		dry_sequence_breakers: string[];
 		mirostat: number;
 		mirostat_tau: number;
 		mirostat_eta: number;
-		stop: string[];
 		max_tokens: number;
 		n_keep: number;
 		n_discard: number;
 		ignore_eos: boolean;
 		stream: boolean;
-		logit_bias: Array<[number, number]>;
 		n_probs: number;
 		min_keep: number;
-		grammar: string;
-		grammar_lazy: boolean;
-		grammar_triggers: string[];
-		preserved_tokens: number[];
 		chat_format: string;
 		reasoning_format: string;
 		reasoning_in_content: boolean;
@@ -209,13 +202,11 @@ export interface ApiSlotData {
 		post_sampling_probs: boolean;
 		lora: Array<{ name: string; scale: number }>;
 	};
-	prompt: string;
 	next_token: {
 		has_next_token: boolean;
 		has_new_line: boolean;
 		n_remain: number;
 		n_decoded: number;
-		stopping_word: string;
 	};
 }
 
@@ -225,6 +216,8 @@ export interface ApiProcessingState {
 	tokensRemaining: number;
 	contextUsed: number;
 	contextTotal: number;
+	outputTokensUsed: number; // Total output tokens (thinking + regular content)
+	outputTokensMax: number; // Max output tokens allowed
 	temperature: number;
 	topP: number;
 	speculative: boolean;
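
One practical effect of the trimmed ApiSlotData (a sketch; the helper is hypothetical): any code still touching the removed fields, like the old prompt-length estimate in slots.ts, now fails to type-check.

// Hypothetical example; `prompt` was removed from ApiSlotData above.
function estimatePromptTokens(slot: ApiSlotData): number {
	return Math.floor(slot.prompt.length / 4);
	// error TS2339: Property 'prompt' does not exist on type 'ApiSlotData'
}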

tools/server/webui/src/stories/ChatMessage.stories.svelte

Lines changed: 5 additions & 3 deletions
@@ -162,10 +162,12 @@
 	// Mock the slots service to provide realistic slot data
 	const mockProcessingState = {
 		status: 'generating' as const,
-		tokensDecoded: 410,
-		tokensRemaining: 1000,
-		contextUsed: 429,
+		tokensDecoded: 1250,
+		tokensRemaining: 750,
+		contextUsed: 3200,
 		contextTotal: 4096,
+		outputTokensUsed: 1250,
+		outputTokensMax: 2048,
 		temperature: 0.8,
 		topP: 0.95,
 		speculative: false,
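
A quick sanity check that this mock exercises the new detail line, assuming the branch added in use-processing-state.svelte.ts above:

const outputPercent = Math.round((1250 / 2048) * 100); // 61
// expected detail in the story: "Output: 1250/2048 (61%)"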
