server/utils/AiProviders/ollama/index.js (6 additions & 1 deletion)
@@ -263,6 +263,7 @@ class OllamaAILLM {
           prompt_tokens: res.prompt_eval_count,
           completion_tokens: res.eval_count,
           total_tokens: res.prompt_eval_count + res.eval_count,
+          eval_duration: res.eval_duration / 1e9,
         },
       };
     })
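Ollama reports eval_duration in nanoseconds, so the added line divides by 1e9 to normalize it to seconds before it is stored alongside the token counts. A minimal sketch of that normalization, using illustrative values in the shape of a non-streaming Ollama response:

// Sketch: normalizing Ollama's nanosecond timings to seconds.
// Field names match Ollama's response format; the values are made up.
const res = {
  prompt_eval_count: 26, // tokens in the prompt
  eval_count: 298, // tokens generated
  eval_duration: 4_709_213_000, // generation time in nanoseconds
};

const usage = {
  prompt_tokens: res.prompt_eval_count,
  completion_tokens: res.eval_count,
  total_tokens: res.prompt_eval_count + res.eval_count,
  eval_duration: res.eval_duration / 1e9, // 4.709213 seconds
};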
@@ -282,7 +283,10 @@
           prompt_tokens: result.output.usage.prompt_tokens,
           completion_tokens: result.output.usage.completion_tokens,
           total_tokens: result.output.usage.total_tokens,
-          outputTps: result.output.usage.completion_tokens / result.duration,
+          outputTps:
+            result.output.usage.completion_tokens /
+            result.output.usage.eval_duration,
+          eval_duration: result.output.usage.eval_duration,
           duration: result.duration,
         },
       };
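With eval_duration in hand, outputTps divides tokens generated by the time spent generating, rather than by the whole request duration, which also includes model load and prompt evaluation. A hedged comparison with illustrative numbers shows why this matters:

// Sketch: eval_duration yields generation-only throughput.
// All numbers are illustrative, not taken from the PR.
const completionTokens = 298;
const duration = 6.2; // wall-clock seconds for the entire request
const evalDuration = 4.709213; // seconds spent purely on token generation

const tpsWallClock = completionTokens / duration; // ~48.1 tps, understates the model
const tpsEvalOnly = completionTokens / evalDuration; // ~63.3 tps, actual generation speed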
@@ -349,6 +353,7 @@
         if (chunk.done) {
           usage.prompt_tokens = chunk.prompt_eval_count;
          usage.completion_tokens = chunk.eval_count;
+          usage.eval_duration = chunk.eval_duration / 1e9;
          writeResponseChunk(response, {
            uuid,
            sources,
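In the streaming path, Ollama only attaches the counters and timings to the final chunk (done: true), which is why the handler copies them onto the running usage object at that point. A minimal sketch of consuming such a stream, assuming chunks shaped like Ollama's streaming responses:

// Sketch: usage metrics only arrive on the final streamed chunk.
async function collectUsage(stream) {
  const usage = {};
  for await (const chunk of stream) {
    if (chunk.done) {
      usage.prompt_tokens = chunk.prompt_eval_count;
      usage.completion_tokens = chunk.eval_count;
      usage.eval_duration = chunk.eval_duration / 1e9; // nanoseconds -> seconds
    }
  }
  return usage;
}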
server/utils/helpers/chat/LLMPerformanceMonitor.js (7 additions & 1 deletion)
@@ -13,6 +13,7 @@ const { TokenManager } = require("../tiktoken");
  * @property {number} total_tokens - the total number of tokens
  * @property {number} outputTps - the tokens per second of the output
  * @property {number} duration - the duration of the stream
+ * @property {number} [eval_duration] - optional eval duration from providers (e.g., Ollama) used for more accurate outputTps calculation
  */
 
 /**
@@ -88,7 +89,12 @@
 
     stream.metrics.total_tokens =
       stream.metrics.prompt_tokens + (stream.metrics.completion_tokens || 0);
-    stream.metrics.outputTps = stream.metrics.completion_tokens / duration;
+
+    // Use eval_duration if provided (for providers like Ollama that report it)
+    // otherwise fall back to total request duration
+    stream.metrics.outputTps =
+      stream.metrics.completion_tokens /
+      (stream.metrics.eval_duration || duration);
     stream.metrics.duration = duration;
     return stream.metrics;
   };
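The monitor keeps the wall-clock figure as a fallback: providers that report eval_duration get the more accurate number, and everything else behaves as before. Note that || also falls through when eval_duration is 0, which doubles as a guard against dividing by zero. A standalone sketch of the fallback, with hypothetical inputs mirroring the monitor's internals:

// Sketch of the fallback logic; `metrics` and `duration` are hypothetical inputs.
function outputTps(metrics, duration) {
  // `||` skips eval_duration when it is undefined, null, or 0,
  // so missing or zero values fall back to the total request duration.
  return metrics.completion_tokens / (metrics.eval_duration || duration);
}

outputTps({ completion_tokens: 298, eval_duration: 4.709213 }, 6.2); // ~63.3
outputTps({ completion_tokens: 298 }, 6.2); // ~48.1 (no eval_duration reported)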