Support a dynamic default max_tokens for the vLLM backend

flpanbin · flpanbin · commit 0b2e0cb7aab3 · 2025-11-07T00:56:55.000Z
Signed-off-by: bin <bin.pan@daocloud.io>
diff --git a/components/src/dynamo/vllm/handlers.py b/components/src/dynamo/vllm/handlers.py
@@ -29,7 +29,7 @@
 
 
 def build_sampling_params(
-    request: Dict[str, Any], default_sampling_params: Dict[str, Any]
+    request: Dict[str, Any], default_sampling_params: Dict[str, Any], model_max_len: int | None = None,
 ) -> SamplingParams:
     """
     Build SamplingParams from a PreprocessedRequest.
@@ -56,6 +56,21 @@ def build_sampling_params(
             if key == "stop":
                 continue
             setattr(sampling_params, key, value)
+   
+    # If the request did not provide max_tokens (missing or None), compute a dynamic default.
+    try:
+        provided_max_tokens = request.get("stop_conditions", {}).get("max_tokens", None)
+        token_ids = request.get("token_ids", [])
+        input_length = len(token_ids)
+        if (
+            model_max_len is not None
+            and (provided_max_tokens is None)
+        ):
+            # Clamp to at least 1 token, even when the prompt is as long as model_max_len or longer
+            dynamic_default = max(1, model_max_len - input_length)
+            sampling_params.max_tokens = dynamic_default
+    except Exception:
+        pass
 
     return sampling_params
 
@@ -65,14 +80,16 @@ class BaseWorkerHandler(ABC):
     Request handler for the generate and clear_kv_blocks endpoints.
     """
 
-    def __init__(self, runtime, component, engine, default_sampling_params):
+    def __init__(self, runtime, component, engine, default_sampling_params, model_max_len: int | None = None):
         self.runtime = runtime
         self.component = component
         self.engine_client = engine
         self.default_sampling_params = default_sampling_params
         self.kv_publishers: list[ZmqKvEventPublisher] | None = None
         self.engine_monitor = VllmEngineMonitor(runtime, engine)
         self.image_loader = ImageLoader()
+        self.model_max_len = model_max_len
+    
 
     @abstractmethod
     async def generate(self, request, context) -> AsyncGenerator[dict, None]:
@@ -212,8 +229,9 @@ def __init__(
         component,
         engine,
         default_sampling_params,
+        model_max_len: int | None = None,
     ):
-        super().__init__(runtime, component, engine, default_sampling_params)
+        super().__init__(runtime, component, engine, default_sampling_params, model_max_len)
 
     async def generate(self, request, context):
         # Use context ID for request tracking and correlation
@@ -228,7 +246,7 @@ async def generate(self, request, context):
         )
 
         # Build sampling params from request
-        sampling_params = build_sampling_params(request, self.default_sampling_params)
+        sampling_params = build_sampling_params(request, self.default_sampling_params, self.model_max_len)
 
         # Extract disaggregated_params from request (set by prefill router in Rust frontend)
         disaggregated_params = request.get("disaggregated_params")
@@ -259,8 +277,8 @@ async def generate(self, request, context):
 
 
 class PrefillWorkerHandler(BaseWorkerHandler):
-    def __init__(self, runtime, component, engine, default_sampling_params):
-        super().__init__(runtime, component, engine, default_sampling_params)
+    def __init__(self, runtime, component, engine, default_sampling_params, model_max_len: int | None = None):
+        super().__init__(runtime, component, engine, default_sampling_params, model_max_len)
 
     async def generate(self, request, context):
         # Use context ID for request tracking and correlation with decode phase
@@ -276,7 +294,7 @@ async def generate(self, request, context):
         )
 
         # Build sampling params from request using shared utility
-        sampling_params = build_sampling_params(request, self.default_sampling_params)
+        sampling_params = build_sampling_params(request, self.default_sampling_params, self.model_max_len)
 
         # Configure for prefill-only mode with remote decode
         if sampling_params.extra_args is None:
diff --git a/components/src/dynamo/vllm/main.py b/components/src/dynamo/vllm/main.py
@@ -317,7 +317,11 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
     engine_client, vllm_config, default_sampling_params = setup_vllm_engine(config)
 
     handler = PrefillWorkerHandler(
-        runtime, component, engine_client, default_sampling_params
+        runtime,
+        component,
+        engine_client,
+        default_sampling_params,
+        getattr(getattr(vllm_config, "model_config", None), "max_model_len", None),
     )
 
     # Check if kv event consolidator is enabled (port was allocated in setup_vllm_engine)
@@ -424,6 +428,7 @@ async def init(runtime: DistributedRuntime, config: Config):
         component,
         engine_client,
         default_sampling_params,
+        getattr(getattr(vllm_config, "model_config", None), "max_model_len", None),
     )
 
     # Check if kv event consolidator is enabled (port was allocated in setup_vllm_engine)