
Commit c6223b3
feat: Improves token usage tracking and display
Updates the token usage calculation to reflect the new slots API format, differentiating between context tokens and output tokens, and displays output token usage in the processing-state details.
Parent: f0f6f20

4 files changed: +40 −22 lines

tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts

Lines changed: 9 additions & 8 deletions
@@ -93,6 +93,15 @@ export function useProcessingState() {
 		);
 	}
 
+	if (processingState.outputTokensUsed > 0) {
+		const outputPercent = Math.round(
+			(processingState.outputTokensUsed / processingState.outputTokensMax) * 100
+		);
+		details.push(
+			`Output: ${processingState.outputTokensUsed}/${processingState.outputTokensMax} (${outputPercent}%)`
+		);
+	}
+
 	if (
 		currentConfig.showTokensPerSecond &&
 		processingState.tokensPerSecond &&
@@ -101,14 +110,6 @@ export function useProcessingState() {
 		details.push(`${processingState.tokensPerSecond.toFixed(1)} tokens/sec`);
 	}
 
-	if (processingState.temperature !== SETTING_CONFIG_DEFAULT.temperature) {
-		details.push(`Temperature: ${processingState.temperature.toFixed(1)}`);
-	}
-
-	if (processingState.topP !== SETTING_CONFIG_DEFAULT.top_p) {
-		details.push(`Top-p: ${processingState.topP.toFixed(2)}`);
-	}
-
 	if (processingState.speculative) {
 		details.push('Speculative decoding enabled');
 	}
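
For illustration, the new branch boils down to the following self-contained sketch; the function name and parameter shape here are mine, not part of the diff:

// Sketch of the output-usage detail added above; `formatOutputDetail` is a
// hypothetical helper, not a function in the codebase.
function formatOutputDetail(state: { outputTokensUsed: number; outputTokensMax: number }): string | null {
	if (state.outputTokensUsed <= 0) return null;
	const outputPercent = Math.round((state.outputTokensUsed / state.outputTokensMax) * 100);
	return `Output: ${state.outputTokensUsed}/${state.outputTokensMax} (${outputPercent}%)`;
}

// formatOutputDetail({ outputTokensUsed: 1250, outputTokensMax: 2048 })
// -> "Output: 1250/2048 (61%)"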

tools/server/webui/src/lib/services/slots.ts

Lines changed: 24 additions & 2 deletions
@@ -196,6 +196,8 @@ export class SlotsService {
 			tokensRemaining: 0,
 			contextUsed: 0,
 			contextTotal: 4096,
+			outputTokensUsed: 0,
+			outputTokensMax: 2048,
 			temperature: 0.8,
 			topP: 0.95,
 			speculative: false,
@@ -214,8 +216,26 @@ export class SlotsService {
 			status = 'preparing';
 		}
 
-		const promptTokens = Math.floor(activeSlot.prompt.length / 4);
-		const contextUsed = promptTokens + activeSlot.next_token.n_decoded;
+		// Calculate context and output token usage with the new slots format.
+		// n_decoded represents ALL tokens generated (thinking + regular content).
+		const totalTokensGenerated = activeSlot.next_token.n_decoded;
+		const maxOutputTokens = activeSlot.params.max_tokens || activeSlot.params.n_predict;
+
+		// For the context calculation, count only the tokens that will be sent back to the API.
+		// We need to estimate how many of the generated tokens are actual message content
+		// vs thinking content. For now, assume thinking is ~60% of the total output.
+		// This is a rough estimate; in reality we'd need to track this separately.
+		const estimatedThinkingRatio = 0.6;
+		const estimatedMessageTokens = Math.floor(totalTokensGenerated * (1 - estimatedThinkingRatio));
+
+		// Context used = estimated prompt + message content tokens only
+		const maxGenerationTokens = Math.min(maxOutputTokens, Math.floor(activeSlot.n_ctx * 0.4));
+		const estimatedPromptTokens = activeSlot.n_ctx - maxGenerationTokens;
+		const contextUsed = Math.min(activeSlot.n_ctx, estimatedPromptTokens + estimatedMessageTokens);
+
+		// Output tokens: total generated tokens (thinking + regular)
+		const outputTokensUsed = totalTokensGenerated;
+		const outputTokensMax = maxOutputTokens;
 
 		const currentTime = Date.now();
 		const currentTokens = activeSlot.next_token.n_decoded;
@@ -275,6 +295,8 @@ export class SlotsService {
 			tokensRemaining: activeSlot.next_token.n_remain,
 			contextUsed,
 			contextTotal: activeSlot.n_ctx,
+			outputTokensUsed,
+			outputTokensMax,
 			temperature: activeSlot.params.temperature,
 			topP: activeSlot.params.top_p,
 			speculative: activeSlot.speculative,
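
Pulled out as a standalone function, the estimation above behaves like this. A sketch only: the 0.6 thinking ratio and the 0.4 context cap are taken from the diff, while the function name and the flattened argument shape are illustrative:

// Hypothetical standalone version of the estimation done in SlotsService.
interface SlotLike {
	n_ctx: number; // context window size
	n_decoded: number; // ALL generated tokens so far (thinking + content)
	max_tokens: number; // the real code falls back to params.n_predict
}

function estimateUsage(slot: SlotLike) {
	const estimatedThinkingRatio = 0.6; // rough guess, per the diff comment
	const estimatedMessageTokens = Math.floor(slot.n_decoded * (1 - estimatedThinkingRatio));
	const maxGenerationTokens = Math.min(slot.max_tokens, Math.floor(slot.n_ctx * 0.4));
	const estimatedPromptTokens = slot.n_ctx - maxGenerationTokens;
	return {
		contextUsed: Math.min(slot.n_ctx, estimatedPromptTokens + estimatedMessageTokens),
		outputTokensUsed: slot.n_decoded,
		outputTokensMax: slot.max_tokens
	};
}

// estimateUsage({ n_ctx: 4096, n_decoded: 1250, max_tokens: 2048 })
// maxGenerationTokens = min(2048, floor(4096 * 0.4)) = 1638
// estimatedPromptTokens = 4096 - 1638 = 2458
// estimatedMessageTokens = floor(1250 * 0.4) = 500
// -> { contextUsed: 2958, outputTokensUsed: 1250, outputTokensMax: 2048 }

Note that the prompt side is now inferred from the context budget rather than read from activeSlot.prompt, which the api.d.ts change below removes.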

tools/server/webui/src/lib/types/api.d.ts

Lines changed: 2 additions & 9 deletions
@@ -180,23 +180,16 @@ export interface ApiSlotData {
 		dry_base: number;
 		dry_allowed_length: number;
 		dry_penalty_last_n: number;
-		dry_sequence_breakers: string[];
 		mirostat: number;
 		mirostat_tau: number;
 		mirostat_eta: number;
-		stop: string[];
 		max_tokens: number;
 		n_keep: number;
 		n_discard: number;
 		ignore_eos: boolean;
 		stream: boolean;
-		logit_bias: Array<[number, number]>;
 		n_probs: number;
 		min_keep: number;
-		grammar: string;
-		grammar_lazy: boolean;
-		grammar_triggers: string[];
-		preserved_tokens: number[];
 		chat_format: string;
 		reasoning_format: string;
 		reasoning_in_content: boolean;
@@ -209,13 +202,11 @@ export interface ApiSlotData {
 		post_sampling_probs: boolean;
 		lora: Array<{ name: string; scale: number }>;
 	};
-	prompt: string;
 	next_token: {
 		has_next_token: boolean;
 		has_new_line: boolean;
 		n_remain: number;
 		n_decoded: number;
-		stopping_word: string;
 	};
 }
 
@@ -225,6 +216,8 @@ export interface ApiProcessingState {
 	tokensRemaining: number;
 	contextUsed: number;
 	contextTotal: number;
+	outputTokensUsed: number; // Total output tokens (thinking + regular content)
+	outputTokensMax: number; // Max output tokens allowed
 	temperature: number;
 	topP: number;
 	speculative: boolean;
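
One practical effect of the trimmed ApiSlotData (a sketch; the helper is hypothetical): any code still touching the removed fields, like the old prompt-length estimate in slots.ts, now fails to type-check.

// Hypothetical example; `prompt` was removed from ApiSlotData above.
function estimatePromptTokens(slot: ApiSlotData): number {
	return Math.floor(slot.prompt.length / 4);
	// error TS2339: Property 'prompt' does not exist on type 'ApiSlotData'
}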

tools/server/webui/src/stories/ChatMessage.stories.svelte

Lines changed: 5 additions & 3 deletions
@@ -162,10 +162,12 @@
 	// Mock the slots service to provide realistic slot data
 	const mockProcessingState = {
 		status: 'generating' as const,
-		tokensDecoded: 410,
-		tokensRemaining: 1000,
-		contextUsed: 429,
+		tokensDecoded: 1250,
+		tokensRemaining: 750,
+		contextUsed: 3200,
 		contextTotal: 4096,
+		outputTokensUsed: 1250,
+		outputTokensMax: 2048,
 		temperature: 0.8,
 		topP: 0.95,
 		speculative: false,
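
A quick sanity check that this mock exercises the new detail line, assuming the branch added in use-processing-state.svelte.ts above:

const outputPercent = Math.round((1250 / 2048) * 100); // 61
// expected detail in the story: "Output: 1250/2048 (61%)"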
