bytedance-iaas
diff --git a/‎components/metrics/src/lib.rs‎
Lines changed: 6 additions & 3 deletions b/‎components/metrics/src/lib.rs‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎components/router/src/main.rs‎
Lines changed: 1 addition & 1 deletion b/‎components/router/src/main.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch‎
Lines changed: 48 additions & 30 deletions b/‎container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch‎
Lines changed: 48 additions & 30 deletions
diff --git a/‎docs/architecture/kv_cache_routing.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/architecture/kv_cache_routing.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/guides/dynamo_run.md‎
Lines changed: 3 additions & 1 deletion b/‎docs/guides/dynamo_run.md‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎docs/guides/kv_router_perf_tuning.md‎
Lines changed: 3 additions & 0 deletions b/‎docs/guides/kv_router_perf_tuning.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎launch/dynamo-run/src/flags.rs‎
Lines changed: 13 additions & 10 deletions b/‎launch/dynamo-run/src/flags.rs‎
Lines changed: 13 additions & 10 deletions
diff --git a/‎launch/dynamo-run/src/subprocess/vllm_inc.py‎
Lines changed: 30 additions & 9 deletions b/‎launch/dynamo-run/src/subprocess/vllm_inc.py‎
Lines changed: 30 additions & 9 deletions
diff --git a/‎lib/bindings/python/rust/lib.rs‎
Lines changed: 0 additions & 1 deletion b/‎lib/bindings/python/rust/lib.rs‎
Lines changed: 0 additions & 1 deletion
@@ -83,7 +83,7 @@ use serde::{Deserialize, Serialize};
 use std::net::SocketAddr;
 use std::time::Duration as StdDuration;
 
-use dynamo_llm::kv_router::protocols::ForwardPassMetrics;
+use dynamo_llm::kv_router::protocols::{ForwardPassMetrics, LoadMetrics};
 use dynamo_llm::kv_router::scheduler::Endpoint;
 use dynamo_llm::kv_router::scoring::ProcessedEndpoints;
 
@@ -449,7 +449,10 @@ impl PrometheusMetrics {
         // Update per-worker metrics
         for (worker_id, endpoint) in processed.endpoints.iter() {
             let worker_id = worker_id.to_string();
-            let metrics = endpoint.data.clone();
+            let load_metrics = endpoint.data.clone();
+            let LoadMetrics::EngineLoadMetrics(metrics) = load_metrics else {
+                panic!("Can only update with ForwardPassMetrics");
+            };
 
             self.set_worker_gauge(
                 &self.kv_blocks_active,
@@ -602,7 +605,7 @@ pub fn postprocess_metrics(
             e.id().ok().map(|id| Endpoint {
                 name: format!("worker-{id}"),
                 subject: e.subject.clone(),
-                data: m.clone(),
+                data: LoadMetrics::EngineLoadMetrics(m.clone()),
             })
         })
         .collect();
 
@@ -66,7 +66,7 @@ async fn app(runtime: Runtime) -> Result<()> {
 
     let selector = Box::new(CustomWorkerSelector::default());
 
-    let router = KvRouter::new(component.clone(), args.block_size, Some(selector)).await?;
+    let router = KvRouter::new(component.clone(), args.block_size, Some(selector), true).await?;
     let router = Ingress::for_engine(Arc::new(router))?;
 
     component
 
@@ -3392,14 +3392,8 @@ index cafd8150b..6a5e45b4e 100644
 +    num_requests_waiting: int
 +    gpu_cache_usage_perc: float
 +    gpu_prefix_cache_hit_rate: float
-+    spec_decode_draft_acceptance_rate: Optional[float] = None
-+    spec_decode_system_efficiency: Optional[float] = None
-+    spec_decode_draft_tokens: Optional[int] = None
-+    spec_decode_emitted_tokens: Optional[int] = None
-+    spec_decode_accepted_tokens: Optional[int] = None
-+    spec_decode_num_spec_tokens: Optional[int] = None
 diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
-index f058b1329..2fdb5b8bf 100644
+index f058b1329..fd5610a3c 100644
 --- a/vllm/engine/multiprocessing/client.py
 +++ b/vllm/engine/multiprocessing/client.py
@@ -1,4 +1,17 @@
@@ -3460,24 +3454,33 @@ index f058b1329..2fdb5b8bf 100644
  from vllm.engine.protocol import EngineClient
  # yapf: enable
  from vllm.envs import VLLM_RPC_TIMEOUT
-@@ -48,6 +66,8 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
+@@ -48,6 +66,17 @@ from vllm.prompt_adapter.request import PromptAdapterRequest
  from vllm.sampling_params import SamplingParams
  from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
  from vllm.utils import Device, deprecate_kwargs
 +from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest, RemotePrefillRequestCallback
 +from vllm.distributed.device_communicators.nixl import NixlMetadata
++
++# Import ForwardPassMetrics and related classes from dynamo
++try:
++    from dynamo.llm import ForwardPassMetrics, WorkerStats, KvStats
++except ImportError:
++    # Fallback if dynamo imports are not available
++    ForwardPassMetrics = None
++    WorkerStats = None
++    KvStats = None
 
  logger = init_logger(__name__)
 
-@@ -93,6 +113,7 @@ class MQLLMEngineClient(EngineClient):
+@@ -93,6 +122,7 @@ class MQLLMEngineClient(EngineClient):
          self._errored_with: Optional[BaseException] = None
 
          # Get the configs.
 +        self.vllm_config = engine_config
          self.model_config = engine_config.model_config
          self.decoding_config = engine_config.decoding_config
 
-@@ -117,6 +138,10 @@ class MQLLMEngineClient(EngineClient):
+@@ -117,6 +147,10 @@ class MQLLMEngineClient(EngineClient):
          self.heartbeat_socket: Socket = self.context.socket(zmq.constants.PULL)
          self.heartbeat_socket.connect(f"{ipc_path}{IPC_HEALTH_EXT}")
 
@@ -3488,7 +3491,7 @@ index f058b1329..2fdb5b8bf 100644
          # IPC path for the data socket.
          self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}"
 
-@@ -131,8 +156,27 @@ class MQLLMEngineClient(EngineClient):
+@@ -131,8 +165,27 @@ class MQLLMEngineClient(EngineClient):
          # Loop to check health of the LLMEngine periodically.
          # Started after the MQLLMEngine is ready.
          self.health_loop: Optional[asyncio.Task] = None
@@ -3516,7 +3519,7 @@ index f058b1329..2fdb5b8bf 100644
      @staticmethod
      def is_unsupported_config(vllm_config: VllmConfig):
          # Pipeline parallel not yet supported
-@@ -182,6 +226,61 @@ class MQLLMEngineClient(EngineClient):
+@@ -182,6 +235,76 @@ class MQLLMEngineClient(EngineClient):
          except Exception as e:
              self._set_errored(e)
 
@@ -3553,13 +3556,28 @@ index f058b1329..2fdb5b8bf 100644
 +                    if self.metrics_publisher is not None and isinstance(
 +                        metrics, KvMetrics
 +                    ):
-+                        self.metrics_publisher.publish(metrics.request_active_slots,
-+                                                    metrics.request_total_slots,
-+                                                    metrics.kv_active_blocks,
-+                                                    metrics.kv_total_blocks,
-+                                                    metrics.num_requests_waiting, 
-+                                                    metrics.gpu_cache_usage_perc, 
-+                                                    metrics.gpu_prefix_cache_hit_rate)
++                        # Construct structured metrics objects
++                        worker_stats = WorkerStats(
++                            request_active_slots=metrics.request_active_slots,
++                            request_total_slots=metrics.request_total_slots,
++                            num_requests_waiting=metrics.num_requests_waiting,
++                            data_parallel_rank=None
++                        )
++                        
++                        kv_stats = KvStats(
++                            kv_active_blocks=metrics.kv_active_blocks,
++                            kv_total_blocks=metrics.kv_total_blocks,
++                            gpu_cache_usage_perc=metrics.gpu_cache_usage_perc,
++                            gpu_prefix_cache_hit_rate=metrics.gpu_prefix_cache_hit_rate
++                        )
++                        
++                        forward_pass_metrics = ForwardPassMetrics(
++                            worker_stats=worker_stats,
++                            kv_stats=kv_stats,
++                            spec_decode_stats=None
++                        )
++                        
++                        self.metrics_publisher.publish(forward_pass_metrics)
 +                        logger.debug("Metrics successful.")
 +
 +                    # TODO: Investigate sending whole stats object
@@ -3578,7 +3596,7 @@ index f058b1329..2fdb5b8bf 100644
      async def run_output_handler_loop(self):
          """Get RequestOutputs from Engine and stream to Request Queues"""
 
-@@ -250,7 +349,7 @@ class MQLLMEngineClient(EngineClient):
+@@ -250,7 +373,7 @@ class MQLLMEngineClient(EngineClient):
                  # Put each output into the appropriate queue.
                  elif isinstance(
                          request_outputs,
@@ -3587,7 +3605,7 @@ index f058b1329..2fdb5b8bf 100644
                      self._add_output(request_outputs)
                  else:
                      for request_output in request_outputs:
-@@ -261,7 +360,7 @@ class MQLLMEngineClient(EngineClient):
+@@ -261,7 +384,7 @@ class MQLLMEngineClient(EngineClient):
 
      def _add_output(self, request_output: Union[RequestOutput,
                                                  RPCAdapterLoadedResponse,
@@ -3596,7 +3614,7 @@ index f058b1329..2fdb5b8bf 100644
          queue = self.output_queues.get(request_output.request_id)
          if queue is not None:
              queue.put_nowait(request_output)
-@@ -283,12 +382,25 @@ class MQLLMEngineClient(EngineClient):
+@@ -283,12 +406,25 @@ class MQLLMEngineClient(EngineClient):
              # Wait until server is ready.
              response = await self._wait_for_server_rpc(socket)
 
@@ -3622,7 +3640,7 @@ index f058b1329..2fdb5b8bf 100644
 
      def close(self):
          """Destroy the ZeroMQ Context."""
-@@ -298,6 +410,8 @@ class MQLLMEngineClient(EngineClient):
+@@ -298,6 +434,8 @@ class MQLLMEngineClient(EngineClient):
          # Cancel background tasks.
          if self.health_loop is not None:
              self.health_loop.cancel()
@@ -3631,7 +3649,7 @@ index f058b1329..2fdb5b8bf 100644
          if self.output_loop is not None:
              self.output_loop.cancel()
 
-@@ -420,6 +534,9 @@ class MQLLMEngineClient(EngineClient):
+@@ -420,6 +558,9 @@ class MQLLMEngineClient(EngineClient):
          """
          if self._errored_with is not None:
              raise self._errored_with
@@ -3641,15 +3659,15 @@ index f058b1329..2fdb5b8bf 100644
 
      @property
      def is_running(self) -> bool:
-@@ -478,6 +595,7 @@ class MQLLMEngineClient(EngineClient):
+@@ -478,6 +619,7 @@ class MQLLMEngineClient(EngineClient):
          trace_headers: Optional[Mapping[str, str]] = None,
          prompt_adapter_request: Optional[PromptAdapterRequest] = None,
          priority: int = 0,
 +        remote_prefill_params: Optional[RemotePrefillParams] = None,
          *,
          inputs: Optional[PromptType] = None  # DEPRECATED
      ) -> AsyncGenerator[RequestOutput, None]:
-@@ -507,7 +625,8 @@ class MQLLMEngineClient(EngineClient):
+@@ -507,7 +649,8 @@ class MQLLMEngineClient(EngineClient):
 
          return self._process_request(prompt, sampling_params, request_id,
                                       lora_request, trace_headers,
@@ -3659,15 +3677,15 @@ index f058b1329..2fdb5b8bf 100644
 
      @overload
      def encode(
-@@ -591,6 +710,7 @@ class MQLLMEngineClient(EngineClient):
+@@ -591,6 +734,7 @@ class MQLLMEngineClient(EngineClient):
          trace_headers: Optional[Mapping[str, str]] = None,
          prompt_adapter_request: Optional[PromptAdapterRequest] = None,
          priority: int = 0,
 +        remote_prefill_params: Optional[RemotePrefillParams] = None,
      ) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[
              PoolingRequestOutput, None]]:
          """Send an RPCGenerateRequest to the RPCServer and stream responses."""
-@@ -636,6 +756,12 @@ class MQLLMEngineClient(EngineClient):
+@@ -636,6 +780,12 @@ class MQLLMEngineClient(EngineClient):
              else:
                  lp_bytes = None
 
@@ -3680,7 +3698,7 @@ index f058b1329..2fdb5b8bf 100644
              request_bytes = pickle.dumps(
                  RPCProcessRequest(
                      prompt=prompt,
-@@ -645,11 +771,11 @@ class MQLLMEngineClient(EngineClient):
+@@ -645,11 +795,11 @@ class MQLLMEngineClient(EngineClient):
                      trace_headers=trace_headers,
                      prompt_adapter_request=prompt_adapter_request,
                      priority=priority,
@@ -3694,7 +3712,7 @@ index f058b1329..2fdb5b8bf 100644
              await self.input_socket.send_multipart(parts, copy=False)
 
              # 4) Stream the RequestOutputs from the output queue. Note
-@@ -740,3 +866,22 @@ class MQLLMEngineClient(EngineClient):
+@@ -740,3 +890,22 @@ class MQLLMEngineClient(EngineClient):
          # Raise on error, otherwise happily return None
          if isinstance(request_output, BaseException):
              raise request_output
 
@@ -15,6 +15,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
+>[!NOTE]
+>This information is temporary and will change soon.
 
 # KV Cache Routing
 This documentation explains how Key-Value (KV) cache routing works in Dynamo, providing optimized inference for large language models by intelligently directing requests to workers with the most relevant cached data while simultaneously load balancing based on utilization metrics sent by the workers.
 
@@ -8,7 +8,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm.
 
 Usage:
 ```
-dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)]
+dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=1.0] [--router-temperature=0.5] [--verbosity (-v|-vv)]
 ```
 
 Example: `dynamo run Qwen/Qwen3-0.6B`
@@ -201,6 +201,8 @@ The only difference from the distributed system above is `--router-mode kv`. The
 
 For performance testing, compare a typical workload with `--router-mode random|round-robin` to see if it can benefit from KV-aware routing.
 
+The argument `--kv-overlap-score-weight` sets the weight placed on overlaps with prefix caches, which directly contributes to the prefill cost, so a larger weight is expected to yield a better TTFT (at the expense of a worse ITL). When this is set to 0, the router does not consider the prefix caches at all (falling back to pure load-balancing behavior on the active blocks), in which case the backend engines are not required to emit any KV events. The argument `--router-temperature` sets the temperature used when randomly selecting the worker to route to via softmax sampling on the router cost logits; setting it to 0 recovers the deterministic behavior where the minimum logit is picked.
+
 ## Full usage details
 
 `dynamo run` executes `dynamo-run`. `dynamo-run` is also an example of what can be built in Rust with the `dynamo-llm` and `dynamo-runtime` crates. The following guide shows how to build from source with all the features.
 
@@ -15,6 +15,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
+>[!NOTE]
+>This information is temporary and will change soon.
+
 # KV Router Performance Tuning
 
 ## Overview
 
@@ -110,20 +110,23 @@ pub struct Flags {
     #[arg(long, default_value = "round-robin")]
     pub router_mode: RouterMode,
 
+    /// Maximum number of batched tokens for KV routing
+    /// Needed for informing the KV router
+    /// TODO: derive from vllm args
+    /// NOTE: this is not actually used for now
+    #[arg(long, default_value = "8192")]
+    pub max_num_batched_tokens: Option<u32>,
+
     /// KV Router: Weight for overlap score in worker selection.
     /// Higher values prioritize KV cache reuse. Default: 2.0
     #[arg(long)]
     pub kv_overlap_score_weight: Option<f64>,
 
-    /// KV Router: Weight for GPU cache usage in worker selection.
-    /// Higher values avoid workers with nearly full KV caches. Default: 1.0
-    #[arg(long)]
-    pub kv_gpu_cache_usage_weight: Option<f64>,
-
-    /// KV Router: Weight for waiting requests in worker selection.
-    /// Higher values avoid workers with queued requests. Default: 1.0
+    /// KV Router: Temperature for worker sampling via softmax.
+    /// Higher values promote more randomness, and 0 falls back to deterministic selection.
+    /// Default: 0.5
     #[arg(long)]
-    pub kv_waiting_requests_weight: Option<f64>,
+    pub router_temperature: Option<f64>,
 
     /// Max model context length. Reduce this if you don't have enough VRAM for the full model
     /// context length (e.g. Llama 4).
@@ -211,8 +214,8 @@ impl Flags {
             self.router_mode.into(),
             KvRouterConfig::new(
                 self.kv_overlap_score_weight,
-                self.kv_gpu_cache_usage_weight,
-                self.kv_waiting_requests_weight,
+                self.router_temperature,
+                self.max_num_batched_tokens,
             ),
         )
     }
 
@@ -26,7 +26,14 @@
 )
 from vllm.inputs import TokensPrompt
 
-from dynamo.llm import ModelType, WorkerMetricsPublisher, register_llm
+from dynamo.llm import (
+    ForwardPassMetrics,
+    KvStats,
+    ModelType,
+    WorkerMetricsPublisher,
+    WorkerStats,
+    register_llm,
+)
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
 
@@ -70,15 +77,29 @@ def setup_kv_metrics(self):
         self.engine_client.set_metrics_publisher(self.metrics_publisher)
         # Initially send dummy metrics to kick start,
         # vLLM will not update stat until forward pass is triggered
-        self.metrics_publisher.publish(
-            0,  # request_active_slots
-            1024,  # request_total_slots
-            0,  # kv_active_blocks
-            1024,  # kv_total_blocks
-            0,  # num_requests_waiting
-            0.0,  # gpu_cache_usage_perc
-            0.0,  # gpu_prefix_cache_hit_rate
+
+        # Create the structured metrics objects
+        worker_stats = WorkerStats(
+            request_active_slots=0,
+            request_total_slots=1024,
+            num_requests_waiting=0,
+            data_parallel_rank=None,
+        )
+
+        kv_stats = KvStats(
+            kv_active_blocks=0,
+            kv_total_blocks=1024,
+            gpu_cache_usage_perc=0.0,
+            gpu_prefix_cache_hit_rate=0.0,
         )
+
+        metrics = ForwardPassMetrics(
+            worker_stats=worker_stats, kv_stats=kv_stats, spec_decode_stats=None
+        )
+
+        # Publish the metrics as a single object
+        self.metrics_publisher.publish(metrics)
+
         task = asyncio.create_task(self.create_metrics_publisher_endpoint())
         task.add_done_callback(
             lambda _: logging.debug("metrics publisher endpoint created")
 
@@ -72,7 +72,6 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<Client>()?;
     m.add_class::<EtcdClient>()?;
     m.add_class::<AsyncResponseStream>()?;
-    m.add_class::<llm::kv::KvRouter>()?;
     m.add_class::<llm::disagg_router::DisaggregatedRouter>()?;
     m.add_class::<llm::kv::WorkerMetricsPublisher>()?;
     m.add_class::<llm::model_card::ModelDeploymentCard>()?;