diff --git a/server/utils/AiProviders/ollama/index.js b/server/utils/AiProviders/ollama/index.js
index b88a8121870..7639c2e946a 100644
--- a/server/utils/AiProviders/ollama/index.js
+++ b/server/utils/AiProviders/ollama/index.js
@@ -263,6 +263,8 @@ class OllamaAILLM {
             prompt_tokens: res.prompt_eval_count,
             completion_tokens: res.eval_count,
             total_tokens: res.prompt_eval_count + res.eval_count,
+            // Override duration with Ollama's eval_duration for more accurate outputTps
+            duration: res.eval_duration / 1e9,
           },
         };
       })
@@ -282,8 +284,9 @@ class OllamaAILLM {
         prompt_tokens: result.output.usage.prompt_tokens,
         completion_tokens: result.output.usage.completion_tokens,
         total_tokens: result.output.usage.total_tokens,
-        outputTps: result.output.usage.completion_tokens / result.duration,
-        duration: result.duration,
+        outputTps:
+          result.output.usage.completion_tokens / result.output.usage.duration,
+        duration: result.output.usage.duration,
       },
     };
   }
@@ -349,6 +352,8 @@ class OllamaAILLM {
         if (chunk.done) {
           usage.prompt_tokens = chunk.prompt_eval_count;
           usage.completion_tokens = chunk.eval_count;
+          // Override duration with Ollama's eval_duration for more accurate outputTps
+          usage.duration = chunk.eval_duration / 1e9;
           writeResponseChunk(response, {
             uuid,
             sources,
diff --git a/server/utils/helpers/chat/LLMPerformanceMonitor.js b/server/utils/helpers/chat/LLMPerformanceMonitor.js
index bd02863edfe..f18a4278d9d 100644
--- a/server/utils/helpers/chat/LLMPerformanceMonitor.js
+++ b/server/utils/helpers/chat/LLMPerformanceMonitor.js
@@ -39,6 +39,8 @@ class LLMPerformanceMonitor {
   }
   /**
    * Wraps a function and logs the duration (in seconds) of the function call.
+   * If the output contains a `usage.duration` property, it will be used instead of the calculated duration.
+   * This allows providers to supply more accurate timing information.
    * @param {Function} func
    * @returns {Promise<{output: any, duration: number}>}
    */
@@ -47,7 +49,11 @@ class LLMPerformanceMonitor {
       const start = Date.now();
       const output = await func; // is a promise
       const end = Date.now();
-      return { output, duration: (end - start) / 1000 };
+      const calculatedDuration = (end - start) / 1000;
+      // Use duration from output.usage if provided (for providers that have more accurate timing)
+      // otherwise use the calculated duration from function start/end times
+      const duration = output?.usage?.duration ?? calculatedDuration;
+      return { output, duration };
     })();
   }
 
@@ -88,8 +94,13 @@ class LLMPerformanceMonitor {
 
       stream.metrics.total_tokens =
         stream.metrics.prompt_tokens + (stream.metrics.completion_tokens || 0);
-      stream.metrics.outputTps = stream.metrics.completion_tokens / duration;
-      stream.metrics.duration = duration;
+
+      // Use duration from reportedUsage if provided (for providers that have more accurate timing)
+      // otherwise use the calculated duration from stream start/end times
+      const effectiveDuration = reportedUsage.duration || duration;
+      stream.metrics.outputTps =
+        stream.metrics.completion_tokens / effectiveDuration;
+      stream.metrics.duration = effectiveDuration;
       return stream.metrics;
     };
     return stream;
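
A note on the unit conversion in this patch: Ollama's API reports eval_duration in nanoseconds, so dividing by 1e9 yields the seconds that outputTps (output tokens per second) expects. Below is a minimal sketch of the arithmetic, not part of the diff; the field names follow Ollama's response shape, but the values are hypothetical.

// Minimal sketch: how the patched metrics derive from a final Ollama
// stream chunk. All numeric values here are hypothetical.
const chunk = {
  done: true,
  prompt_eval_count: 26, // prompt tokens
  eval_count: 290, // generated tokens
  eval_duration: 4_709_000_000, // nanoseconds spent generating, per Ollama's API
};

const duration = chunk.eval_duration / 1e9; // 4.709 seconds
const outputTps = chunk.eval_count / duration; // ~61.6 tokens/second

// The wall-clock fallback ((end - start) / 1000) also spans prompt
// evaluation, model load, and network time, which understates the
// true generation speed; eval_duration measures generation only.
console.log({ duration, outputTps });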