Skip to content

Commit 59a9ecc

Browse files
committed
fix(kv-indexer): keep runtime in the maturin lane
Signed-off-by: PeaBrane <yanrpei@gmail.com>
2 parents c32605c + c7bac97 commit 59a9ecc

File tree

139 files changed

+3587
-839
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

139 files changed

+3587
-839
lines changed

.github/workflows/docs-link-check.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ jobs:
4444
--host-concurrency 10
4545
--host-request-interval 1s
4646
--host-stats
47-
${{ github.event_name == 'pull_request' && '--offline' || '' }}
4847
.
4948
fail: true
5049
env:

.github/workflows/fern-docs.yml

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -172,25 +172,14 @@ jobs:
172172
rm -rf docs-checkout/fern/pages-dev
173173
mkdir -p docs-checkout/fern/pages-dev
174174
rsync -a \
175-
--exclude='assets' \
176175
--exclude='blogs' \
177-
--exclude='diagrams' \
178176
--exclude='index.yml' \
179177
source-checkout/docs/ docs-checkout/fern/pages-dev/
180178
181179
# Sync index.yml as versions/dev.yml and transform paths for docs-website layout
182180
echo "Syncing index.yml to docs-website branch as versions/dev.yml..."
183181
cp source-checkout/docs/index.yml docs-checkout/fern/versions/dev.yml
184182
185-
# Sync assets/ directory
186-
echo "Syncing assets/ to docs-website branch..."
187-
rm -rf docs-checkout/fern/assets
188-
cp -r source-checkout/docs/assets docs-checkout/fern/assets
189-
190-
# Symlink assets into pages-dev/ so relative image paths in markdown resolve correctly
191-
# (e.g. ../../assets/img/foo.png from pages-dev/observability/metrics.md)
192-
ln -sfn ../assets docs-checkout/fern/pages-dev/assets
193-
194183
# Sync fern.config.json
195184
echo "Syncing fern.config.json to docs-website branch..."
196185
cp source-checkout/fern/fern.config.json docs-checkout/fern/fern.config.json

.github/workflows/pr.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,10 @@ jobs:
197197
run_single_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }}
198198
single_gpu_test_markers: 'pre_merge and vllm and gpu_1'
199199
single_gpu_test_timeout_minutes: 35
200-
run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
200+
run_multi_gpu_tests: false # TODO: select multi-GPU tests based for pre_merge from post_merge and anable below lines.
201+
# run_multi_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }}
202+
# multi_gpu_test_markers: 'pre_merge and vllm and gpu_2'
203+
# multi_gpu_test_timeout_minutes: 30
201204
secrets: inherit
202205

203206
# ============================================================================

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ jobs:
4949
exit 1
5050
fi
5151
52-
if [[ ! "$BRANCH_NAME" =~ ^release/[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
52+
if [[ ! "$BRANCH_NAME" =~ ^release/[0-9]+\.[0-9]+\.[0-9]+ ]]; then
5353
echo "Error: workflow_dispatch must be triggered from a release/* branch"
5454
echo "Current branch: $BRANCH_NAME"
5555
exit 1

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ derive-getters = { version = "0.5" }
8080
either = { version = "1.13", features = ["serde"] }
8181
etcd-client = { version = "0.17.0", features = ["tls"] }
8282
futures = { version = "0.3" }
83+
futures-util = { version = "0.3.32" }
8384
hf-hub = { version = "0.4.2", default-features = false, features = [
8485
"tokio",
8586
"rustls-tls",

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,13 +97,13 @@ Containers have all dependencies pre-installed. No setup required.
9797

9898
```bash
9999
# SGLang
100-
docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.8.1
100+
docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.0.0
101101

102102
# TensorRT-LLM
103-
docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1
103+
docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.0
104104

105105
# vLLM
106-
docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1
106+
docker run --gpus all --network host --rm -it nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.0.0
107107
```
108108

109109
> **Tip:** To run frontend and worker in the same container, either run processes in background with `&` (see below), or open a second terminal and use `docker exec -it <container_id> bash`.

components/src/dynamo/common/configuration/groups/kv_router_args.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
"router_event_threads",
3636
"router_enable_cache_control",
3737
"router_queue_policy",
38+
"remote_indexer_component",
3839
)
3940

4041

@@ -58,6 +59,7 @@ class KvRouterConfigBase(ConfigBase):
5859
router_event_threads: int
5960
router_enable_cache_control: bool
6061
router_queue_policy: str
62+
remote_indexer_component: Optional[str]
6163

6264
def kv_router_kwargs(self) -> dict:
6365
"""Return a dict suitable for ``KvRouterConfig(**kwargs)``."""
@@ -269,3 +271,15 @@ def add_arguments(self, parser) -> None:
269271
arg_type=str,
270272
choices=["fcfs", "wspt"],
271273
)
274+
add_argument(
275+
g,
276+
flag_name="--remote-indexer-component",
277+
env_var="DYN_REMOTE_INDEXER_COMPONENT",
278+
default=None,
279+
help=(
280+
"[EXPERIMENTAL] KV Router: Component name of a standalone KV indexer to use for overlap scoring. "
281+
"When set, the router queries the standalone indexer via the request plane instead "
282+
"of maintaining a local radix tree (e.g. 'kv-indexer')."
283+
),
284+
arg_type=str,
285+
)

components/src/dynamo/mocker/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ The mocker engine now supports a vLLM-style CLI interface with individual argume
2323
- `--enable-chunked-prefill` / `--no-enable-chunked-prefill`: Enable/disable chunked prefill (default: True)
2424
- `--preemption-mode`: Preemption mode for decode eviction under memory pressure: `lifo` (default, matches vLLM v1) or `fifo`
2525
- `--speedup-ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster. Use `0` for infinite speedup (no simulation delays)
26+
- `--decode-speedup-ratio`: Additional speedup multiplier applied only to decode steps (default: 1.0). Models speculative decoding (e.g. Eagle) where decode throughput improves without affecting prefill latency. Effective decode speedup is `speedup_ratio * decode_speedup_ratio`
2627
- `--data-parallel-size`: Number of data parallel workers to simulate (default: 1)
2728
- `--num-workers`: Number of mocker workers to launch in the same process (default: 1). All workers share the same tokio runtime and thread pool
2829
- `--stagger-delay`: Delay in seconds between launching each worker to avoid overwhelming etcd/NATS/frontend. Set to 0 to disable staggering. Use -1 for auto mode (stagger dependent on number of workers). Default: -1 (auto)

components/src/dynamo/mocker/args.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def create_temp_engine_args_file(args: argparse.Namespace) -> Path:
109109
"enable_chunked_prefill": getattr(args, "enable_chunked_prefill", None),
110110
"preemption_mode": getattr(args, "preemption_mode", None),
111111
"speedup_ratio": getattr(args, "speedup_ratio", None),
112+
"decode_speedup_ratio": getattr(args, "decode_speedup_ratio", None),
112113
"dp_size": getattr(args, "dp_size", None),
113114
"startup_time": getattr(args, "startup_time", None),
114115
"planner_profile_data": (
@@ -301,6 +302,14 @@ def parse_args() -> argparse.Namespace:
301302
default=None,
302303
help="Speedup ratio for mock execution (default: 1.0). Use 0 for infinite speedup (no simulation delays).",
303304
)
305+
parser.add_argument(
306+
"--decode-speedup-ratio",
307+
type=float,
308+
default=None,
309+
help="Additional speedup multiplier applied only to decode steps (default: 1.0). "
310+
"Models speculative decoding (e.g. Eagle) where decode throughput improves "
311+
"without affecting prefill latency. Effective decode speedup is speedup_ratio * decode_speedup_ratio.",
312+
)
304313
parser.add_argument(
305314
"--data-parallel-size",
306315
type=int,
@@ -462,6 +471,13 @@ def parse_args() -> argparse.Namespace:
462471
default=os.environ.get("DYN_REQUEST_PLANE", "tcp"),
463472
help="Determines how requests are distributed from routers to workers. 'tcp' is fastest [nats|http|tcp]",
464473
)
474+
parser.add_argument(
475+
"--event-plane",
476+
type=str,
477+
choices=["nats", "zmq"],
478+
default=os.environ.get("DYN_EVENT_PLANE", "nats"),
479+
help="Determines how events are published [nats|zmq]",
480+
)
465481

466482
args = parser.parse_args()
467483
validate_worker_type_args(args)

components/src/dynamo/mocker/main.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
os.environ.setdefault("DYN_COMPUTE_THREADS", "0")
2020

21+
from dynamo.common.utils.runtime import create_runtime
2122
from dynamo.llm import (
2223
EngineType,
2324
EntrypointArgs,
@@ -26,7 +27,6 @@
2627
make_engine,
2728
run_input,
2829
)
29-
from dynamo.runtime import DistributedRuntime
3030
from dynamo.runtime.logging import configure_dynamo_logging
3131

3232
from .args import create_temp_engine_args_file, parse_args, resolve_planner_profile_data
@@ -193,7 +193,6 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
193193
- Independent service registration and stats scraping
194194
- But still sharing the same tokio runtime (efficient)
195195
"""
196-
loop = asyncio.get_running_loop()
197196
futures = []
198197
runtimes = []
199198
per_worker_temp_files: list[Path] = []
@@ -227,10 +226,12 @@ async def launch_workers(args: argparse.Namespace, extra_engine_args_path: Path)
227226
logger.info(f"Creating mocker worker {worker_id + 1}/{args.num_workers}")
228227

229228
# Create a separate DistributedRuntime for this worker (on same event loop)
230-
runtime = DistributedRuntime(
231-
loop,
229+
230+
runtime, loop = create_runtime(
232231
args.discovery_backend,
233232
args.request_plane,
233+
args.event_plane,
234+
True, # statically set to True, just determines to enable_nats if event_plane is nats
234235
)
235236
runtimes.append(runtime)
236237

0 commit comments

Comments
 (0)