fix(vllm): avoid unnecessary incremental processing in non-streaming mode

guicho271828 · guicho271828 · commit d274720199fc · 2025-09-24T15:23:50.000-04:00
diff --git a/mellea/backends/vllm.py b/mellea/backends/vllm.py
@@ -275,7 +275,13 @@ def _generate_from_context_standard(
                 **self._make_backend_specific_and_remove(
                     model_options, vllm.SamplingParams
                 ),
-                output_kind=vllm.sampling_params.RequestOutputKind.DELTA,  # returns results incrementally
+                output_kind=(
+                    # streaming: each output holds only the new delta
+                    vllm.sampling_params.RequestOutputKind.DELTA
+                    if model_options.get(ModelOption.STREAM, False)
+                    # returns only the final result
+                    else vllm.sampling_params.RequestOutputKind.FINAL_ONLY
+                ),
             )
 
             if format is not None: