README.md: 3 additions & 3 deletions
@@ -97,13 +97,13 @@ Containers have all dependencies pre-installed. No setup required.
```bash
# SGLang
-docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.8.1
+docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.0.0

# TensorRT-LLM
-docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1
+docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.0

# vLLM
-docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1
+docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.0.0
```
> **Tip:** To run frontend and worker in the same container, either run processes in background with `&` (see below), or open a second terminal and use `docker exec -it <container_id> bash`.
- `--preemption-mode`: Preemption mode for decode eviction under memory pressure: `lifo` (default, matches vLLM v1) or `fifo`
- `--speedup-ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. Use `0` for infinite speedup (no simulation delays)
+- `--decode-speedup-ratio`: Additional speedup multiplier applied only to decode steps (default: 1.0). Models speculative decoding (e.g. Eagle), where decode throughput improves without affecting prefill latency. The effective decode speedup is `speedup_ratio * decode_speedup_ratio`
- `--data-parallel-size`: Number of data parallel workers to simulate (default: 1)
- `--num-workers`: Number of mocker workers to launch in the same process (default: 1). All workers share the same tokio runtime and thread pool
- `--stagger-delay`: Delay in seconds between launching each worker, to avoid overwhelming etcd/NATS/frontend. Set to `0` to disable staggering, or `-1` for auto mode (stagger depends on the number of workers). Default: `-1` (auto)
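
The interaction between `--speedup-ratio` and `--decode-speedup-ratio` can be sketched as follows. This is an illustrative helper, not the mocker's actual implementation; only the product rule and the "`0` means infinite speedup" convention come from the flag descriptions above:

```python
def simulated_decode_delay(base_delay_s: float,
                           speedup_ratio: float = 1.0,
                           decode_speedup_ratio: float = 1.0) -> float:
    """Scale a simulated per-step decode delay by the two speedup flags.

    Hypothetical helper for illustration: the effective decode speedup is
    speedup_ratio * decode_speedup_ratio, and a speedup of 0 means infinite
    speedup, i.e. no simulation delay at all. Prefill delays would be scaled
    by speedup_ratio alone, since decode_speedup_ratio applies only to decode.
    """
    effective = speedup_ratio * decode_speedup_ratio
    if effective == 0:
        return 0.0
    return base_delay_s / effective

# A 20 ms decode step at --speedup-ratio 2.0 and --decode-speedup-ratio 1.5
# runs with an effective 3x speedup.
print(round(simulated_decode_delay(0.020, 2.0, 1.5), 6))  # → 0.006667
```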