Skip to content

Commit 3029de2

Browse files
committed
feat: Improves slots processing and token rate accuracy
Refactors slots service to improve token rate calculation and avoid unnecessary polling. Implements debouncing for slots updates during streaming to prevent excessive requests. Adds a delay to stop monitoring after streaming to capture final updates. Ensures that slots monitoring starts and stops correctly based on streaming activity.
1 parent 7fd78b6 commit 3029de2

File tree

5 files changed: 171 additions and 85 deletions.

tools/server/webui/src/lib/components/app/SlotsInfo.svelte

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,22 @@
11
<script lang="ts">
22
import { useProcessingState } from '$lib/hooks/use-processing-state.svelte';
33
import { isLoading } from '$lib/stores/chat.svelte';
4-
import { onMount } from 'svelte';
54
65
const processingState = useProcessingState();
76
87
let showSlotsInfo = $derived(isLoading());
98
109
let processingDetails = $derived(processingState.getProcessingDetails());
1110
12-
onMount(() => {
13-
processingState.startMonitoring();
14-
15-
return () => {
16-
processingState.stopMonitoring();
17-
};
18-
});
19-
11+
// Monitor during loading and add delay before stopping to capture final updates
2012
$effect(() => {
2113
if (isLoading()) {
2214
processingState.startMonitoring();
2315
} else {
24-
processingState.stopMonitoring();
16+
// Delay stopping to capture final context updates after streaming
17+
setTimeout(() => {
18+
processingState.stopMonitoring();
19+
}, 2000); // 2 second delay to ensure we get final updates
2520
}
2621
});
2722
</script>

tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,18 @@ export function useProcessingState() {
1717
});
1818

1919
try {
20-
await slotsService.startPolling();
20+
// Try to get current state immediately for UI display
21+
const currentState = await slotsService.getCurrentState();
22+
if (currentState) {
23+
processingState = currentState;
24+
}
25+
26+
// Start streaming polling only if streaming is active
27+
if (slotsService.isStreaming()) {
28+
slotsService.startStreamingPolling();
29+
}
2130
} catch (error) {
22-
console.warn('Failed to start slots polling:', error);
31+
console.warn('Failed to start slots monitoring:', error);
2332
// Continue without slots monitoring - graceful degradation
2433
}
2534
}
@@ -34,8 +43,6 @@ export function useProcessingState() {
3443
unsubscribe();
3544
unsubscribe = null;
3645
}
37-
38-
slotsService.stopPolling();
3946
}
4047

4148
function getProcessingMessage(): string {

tools/server/webui/src/lib/services/slots.ts

Lines changed: 128 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -2,58 +2,44 @@ import type { ApiSlotData, ApiProcessingState } from '$lib/types/api';
22
import { serverStore } from '$lib/stores/server.svelte';
33

44
export class SlotsService {
5-
private pollingInterval: number;
6-
private pollingTimer: number | null = null;
75
private callbacks: Set<(state: ApiProcessingState) => void> = new Set();
8-
private slotsAvailable: boolean | null = null;
9-
private slotsEndpointSupported: boolean | null = null;
106
private lastTokenCount: number = 0;
117
private lastTimestamp: number = 0;
8+
private isStreamingActive: boolean = false;
9+
private currentTokensPerSecond: number = 0;
10+
private tokenRateHistory: number[] = [];
11+
private lastUpdateTime: number = 0;
12+
private pendingUpdate: boolean = false;
13+
private streamStartTime: number = 0;
14+
private streamStartTokens: number = 0;
1215

13-
constructor(pollingInterval = 500) {
14-
this.pollingInterval = pollingInterval;
15-
}
16+
constructor() {}
1617

1718
/**
1819
* Check if slots endpoint is available based on server properties and endpoint support
1920
*/
2021
private async isSlotsEndpointAvailable(): Promise<boolean> {
21-
// If we've already determined endpoint support, use cached result
22-
if (this.slotsEndpointSupported !== null) {
23-
return this.slotsEndpointSupported;
24-
}
25-
26-
// First check server properties
2722
const serverProps = serverStore.serverProps;
23+
2824
if (!serverProps) {
29-
this.slotsEndpointSupported = false;
3025
return false;
3126
}
3227

33-
// Check if server has slots support (total_slots > 0)
3428
if (serverProps.total_slots <= 0) {
35-
this.slotsEndpointSupported = false;
3629
return false;
3730
}
3831

39-
// Test if the endpoint is actually implemented
4032
try {
4133
const response = await fetch('/slots');
4234

43-
// Handle 501 Not Implemented specifically
4435
if (response.status === 501) {
4536
console.info('Slots endpoint not implemented - server started without --slots flag');
46-
this.slotsEndpointSupported = false;
4737
return false;
4838
}
4939

50-
// If we get any successful response or other error, assume it's supported
51-
this.slotsEndpointSupported = true;
5240
return true;
5341
} catch (error) {
54-
// Network errors - assume endpoint might be supported but server is down
5542
console.warn('Unable to test slots endpoint availability:', error);
56-
this.slotsEndpointSupported = false;
5743
return false;
5844
}
5945
}
@@ -62,33 +48,87 @@ export class SlotsService {
6248
* Reset slots availability check (call when server properties change)
6349
*/
6450
resetAvailabilityCheck(): void {
65-
this.slotsAvailable = null;
66-
this.slotsEndpointSupported = null;
6751
}
6852

69-
async startPolling(): Promise<void> {
70-
if (this.pollingTimer) {
53+
/**
54+
* Start streaming session tracking
55+
*/
56+
startStreamingPolling(): void {
57+
this.isStreamingActive = true;
58+
this.streamStartTime = Date.now();
59+
this.streamStartTokens = 0;
60+
this.currentTokensPerSecond = 0;
61+
this.tokenRateHistory = [];
62+
}
63+
64+
/**
65+
* Stop streaming session tracking
66+
*/
67+
stopStreamingPolling(): void {
68+
this.isStreamingActive = false;
69+
this.lastTokenCount = 0;
70+
this.lastTimestamp = 0;
71+
this.currentTokensPerSecond = 0;
72+
this.tokenRateHistory = [];
73+
this.lastUpdateTime = 0;
74+
this.pendingUpdate = false;
75+
this.streamStartTime = 0;
76+
this.streamStartTokens = 0;
77+
}
78+
79+
/**
80+
* Check if currently in a streaming session
81+
*/
82+
isStreaming(): boolean {
83+
return this.isStreamingActive;
84+
}
85+
86+
/**
87+
* Fetch and update slots state on demand (called during streaming chunks)
88+
* Debounced to prevent excessive requests during high-frequency streaming
89+
*/
90+
async updateSlotsState(): Promise<void> {
91+
if (!this.isStreamingActive) {
7192
return;
7293
}
7394

74-
// Only start polling if slots endpoint is available
75-
const isAvailable = await this.isSlotsEndpointAvailable();
76-
if (!isAvailable) {
77-
console.info('Slots endpoint not available - polling disabled');
95+
const currentTime = Date.now();
96+
const timeSinceLastUpdate = currentTime - this.lastUpdateTime;
97+
98+
// For the first few calls, use shorter debouncing to get tokens/sec faster
99+
const debounceTime = this.tokenRateHistory.length < 2 ? 50 : 100;
100+
101+
if (timeSinceLastUpdate < debounceTime) {
102+
if (!this.pendingUpdate) {
103+
this.pendingUpdate = true;
104+
setTimeout(async () => {
105+
this.pendingUpdate = false;
106+
await this.performUpdate();
107+
}, debounceTime - timeSinceLastUpdate);
108+
}
78109
return;
79110
}
80111

81-
this.poll();
82-
this.pollingTimer = window.setInterval(() => {
83-
this.poll();
84-
}, this.pollingInterval);
112+
await this.performUpdate();
85113
}
86114

87-
stopPolling(): void {
88-
if (this.pollingTimer) {
89-
clearInterval(this.pollingTimer);
90-
this.pollingTimer = null;
115+
116+
/**
117+
* Perform the actual slots state update
118+
*/
119+
private async performUpdate(): Promise<void> {
120+
if (!this.isStreamingActive) {
121+
return;
122+
}
123+
124+
const isAvailable = await this.isSlotsEndpointAvailable();
125+
126+
if (!isAvailable) {
127+
return;
91128
}
129+
130+
this.lastUpdateTime = Date.now();
131+
await this.fetchAndNotify();
92132
}
93133

94134
subscribe(callback: (state: ApiProcessingState) => void): () => void {
@@ -98,15 +138,12 @@ export class SlotsService {
98138
};
99139
}
100140

101-
private async poll(): Promise<void> {
141+
private async fetchAndNotify(): Promise<void> {
102142
try {
103143
const response = await fetch(`/slots`);
104144

105-
// Handle 501 Not Implemented - stop polling and mark as unsupported
106145
if (response.status === 501) {
107-
console.info('Slots endpoint not implemented - stopping polling');
108-
this.slotsEndpointSupported = false;
109-
this.stopPolling();
146+
console.info('Slots endpoint not implemented');
110147
return;
111148
}
112149

@@ -118,6 +155,7 @@ export class SlotsService {
118155
const slots: ApiSlotData[] = await response.json();
119156
const processingState = this.parseProcessingState(slots);
120157

158+
121159
this.callbacks.forEach(callback => {
122160
try {
123161
callback(processingState);
@@ -126,7 +164,7 @@ export class SlotsService {
126164
}
127165
});
128166
} catch (error) {
129-
console.warn('Error polling slots:', error);
167+
console.warn('Error fetching slots:', error);
130168
}
131169
}
132170

@@ -158,31 +196,59 @@ export class SlotsService {
158196
status = 'preparing';
159197
}
160198

161-
// Calculate context usage (estimate based on prompt length and decoded tokens)
162-
const promptTokens = Math.floor(activeSlot.prompt.length / 4); // Rough estimate
199+
const promptTokens = Math.floor(activeSlot.prompt.length / 4);
163200
const contextUsed = promptTokens + activeSlot.next_token.n_decoded;
164201

165-
// Calculate tokens per second
166-
let tokensPerSecond = 0;
167202
const currentTime = Date.now();
168203
const currentTokens = activeSlot.next_token.n_decoded;
169204

170-
if (status === 'generating' && this.lastTimestamp > 0 && currentTokens > this.lastTokenCount) {
171-
const timeDiff = (currentTime - this.lastTimestamp) / 1000; // Convert to seconds
172-
const tokenDiff = currentTokens - this.lastTokenCount;
173-
if (timeDiff > 0) {
174-
tokensPerSecond = tokenDiff / timeDiff;
205+
if (this.isStreamingActive) {
206+
// Initialize stream tracking on first call
207+
if (this.streamStartTokens === 0 && currentTokens > 0) {
208+
this.streamStartTokens = currentTokens;
209+
this.streamStartTime = currentTime;
175210
}
211+
212+
// Calculate tokens/sec using multiple methods for reliability
213+
let calculatedRate = 0;
214+
215+
// Method 1: Use recent interval (preferred for accuracy)
216+
if (this.lastTimestamp > 0 && currentTokens > this.lastTokenCount) {
217+
const timeDiff = (currentTime - this.lastTimestamp) / 1000;
218+
const tokenDiff = currentTokens - this.lastTokenCount;
219+
220+
if (timeDiff > 0.02) {
221+
calculatedRate = tokenDiff / timeDiff;
222+
}
223+
}
224+
225+
// Method 2: Use total stream time (fallback for early display)
226+
if (calculatedRate === 0 && this.streamStartTime > 0 && currentTokens > this.streamStartTokens) {
227+
const totalTimeDiff = (currentTime - this.streamStartTime) / 1000;
228+
const totalTokenDiff = currentTokens - this.streamStartTokens;
229+
230+
if (totalTimeDiff > 0.1) { // At least 100ms of streaming
231+
calculatedRate = totalTokenDiff / totalTimeDiff;
232+
}
233+
}
234+
235+
// Update rate if we have a valid calculation
236+
if (calculatedRate > 0) {
237+
this.tokenRateHistory.push(calculatedRate);
238+
if (this.tokenRateHistory.length > 5) {
239+
this.tokenRateHistory.shift();
240+
}
241+
242+
this.currentTokensPerSecond = this.tokenRateHistory.reduce((sum, rate) => sum + rate, 0) / this.tokenRateHistory.length;
243+
}
244+
245+
// Always show some rate during active streaming (even if 0 initially)
246+
// This ensures the UI always displays tokens/sec field during streaming
176247
}
177248

178-
// Update tracking for next calculation
179-
if (status === 'generating') {
249+
if (this.isStreamingActive && currentTokens >= this.lastTokenCount) {
180250
this.lastTokenCount = currentTokens;
181251
this.lastTimestamp = currentTime;
182-
} else if (status === 'idle') {
183-
// Reset when idle
184-
this.lastTokenCount = 0;
185-
this.lastTimestamp = 0;
186252
}
187253

188254
return {
@@ -195,24 +261,22 @@ export class SlotsService {
195261
topP: activeSlot.params.top_p,
196262
speculative: activeSlot.speculative,
197263
hasNextToken: activeSlot.next_token.has_next_token,
198-
tokensPerSecond
264+
tokensPerSecond: this.currentTokensPerSecond
199265
};
200266
}
201267

202268
async getCurrentState(): Promise<ApiProcessingState | null> {
203-
// Check if slots endpoint is available before making request
204269
const isAvailable = await this.isSlotsEndpointAvailable();
270+
205271
if (!isAvailable) {
206272
return null;
207273
}
208274

209275
try {
210276
const response = await fetch(`/slots`);
211277

212-
// Handle 501 Not Implemented
213278
if (response.status === 501) {
214279
console.info('Slots endpoint not implemented');
215-
this.slotsEndpointSupported = false;
216280
return null;
217281
}
218282

0 commit comments

Comments
 (0)