Skip to content

Commit 28fd481

Browse files
authored
feat(dynamo-run): vllm and sglang subprocess engines (ai-dynamo#954)
New vllm and sglang engines that run in a sub-process. Will hopefully replace the existing embedded python engines. Why? - Pure Python, does not require knowing Rust to work on it. Much simpler to maintain. - No embedded Python interpreter which avoids linking libpython and avoids the MacOS virtualenv issues. - Should have better performance as it's "native" vllm / sglang. - Works with any version of vllm (including v1!) and sglang. Less upgrade struggle.
1 parent 9f0e12a commit 28fd481

File tree

17 files changed

+926
-52
lines changed

17 files changed

+926
-52
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ etcd-client = { version = "0.14" }
5959
futures = { version = "0.3" }
6060
hf-hub = { version = "0.4.2", default-features = false, features = ["tokio", "rustls-tls"] }
6161
humantime = { version = "2.2.0" }
62+
libc = { version = "0.2" }
6263
prometheus = { version = "0.14" }
6364
rand = { version = "0.9.0" }
6465
serde = { version = "1", features = ["derive"] }

docs/guides/dynamo_run.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -239,16 +239,16 @@ Inside that virtualenv:
239239
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf
240240
```
241241
242+
Note that vllm GGUF handling is very slow. Prefer llamacpp.
243+
242244
**Multi-node:**
243-
**Node 1:**
244-
```
245-
dynamo-run in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
246-
```
247245
248-
**Node 2:**
249-
```
250-
dynamo-run in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
251-
```
246+
vllm uses [ray](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#running-vllm-on-multiple-nodes) for pipeline parallel inference. Dynamo does not change or manage that.
247+
248+
Head node (the one running `dynamo-run`): `ray start --head --port=6379 --dashboard-host 0.0.0.0`
249+
Each worker node: `ray start --address='<HEAD_NODE_IP>:6379'`
250+
251+
Remember to pass dynamo-run `--tensor-parallel-size <total-gpus-across-cluster>`, which is often constrained by a model dimension such as being a divisor of the number of attention heads.
252252
253253
To pass extra arguments to the vllm engine see [Extra engine arguments](#extra_engine_arguments) below.
254254

launch/dynamo-run/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ async-stream = { workspace = true }
5454
async-trait = { workspace = true }
5555
futures = { workspace = true }
5656
humantime = { workspace = true }
57+
libc = { workspace = true }
5758
serde = { workspace = true }
5859
serde_json = { workspace = true }
5960
tempfile = { workspace = true }
@@ -66,6 +67,7 @@ async-openai = { version = "0.27.2" }
6667
clap = { version = "4.5", features = ["derive", "env"] }
6768
dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] }
6869
futures-util = { version = "0.3" }
70+
regex = "1"
6971

7072
[target.x86_64-unknown-linux-gnu.dependencies]
7173
netlink-packet-route = { version = "0.19", optional = true }

launch/dynamo-run/src/input/batch.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ pub async fn run(
202202
tokens_out,
203203
tokens_out / cmp::max(elapsed.as_secs(), 1),
204204
);
205+
cancel_token.cancel(); // stop everything else
205206

206207
Ok(())
207208
}

launch/dynamo-run/src/input/http.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,9 @@ pub async fn run(
9999
}
100100
EngineConfig::None => unreachable!(),
101101
}
102-
http_service.run(runtime.primary_token()).await
102+
http_service.run(runtime.primary_token()).await?;
103+
runtime.shutdown(); // Cancel primary token
104+
Ok(())
103105
}
104106

105107
/// Spawns a task that watches for new models in etcd at network_prefix,

launch/dynamo-run/src/input/text.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ async fn main_loop(
191191
break;
192192
}
193193
}
194+
cancel_token.cancel(); // stop everything else
194195
println!();
195196
Ok(())
196197
}

launch/dynamo-run/src/lib.rs

Lines changed: 112 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@
1515

1616
#[cfg(any(feature = "vllm", feature = "sglang"))]
1717
use std::{future::Future, pin::Pin};
18-
use std::{io::Read, sync::Arc};
18+
use std::{io::Read, sync::Arc, time::Duration};
1919

2020
use anyhow::Context;
2121
use dynamo_llm::{
2222
backend::ExecutionContext, engines::StreamingEngine, kv_router::publisher::KvMetricsPublisher,
2323
LocalModel,
2424
};
25-
use dynamo_runtime::{protocols::Endpoint, DistributedRuntime};
25+
use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime};
2626

2727
mod flags;
2828
pub use flags::Flags;
@@ -32,11 +32,7 @@ mod net;
3232
mod opt;
3333
pub use dynamo_llm::request_template::RequestTemplate;
3434
pub use opt::{Input, Output};
35-
36-
/// How we identify a namespace/component/endpoint URL.
37-
/// Technically the '://' is not part of the scheme but it eliminates several string
38-
/// concatenations.
39-
const ENDPOINT_SCHEME: &str = "dyn://";
35+
mod subprocess;
4036

4137
/// When `in=text` the user doesn't need to know the model name, and doesn't need to provide it on
4238
/// the command line. Hence it's optional, and defaults to this.
@@ -45,6 +41,8 @@ const INVISIBLE_MODEL_NAME: &str = "dynamo-run";
4541
/// The component name for the KV publisher, if used
4642
const KV_PUBLISHER_COMPONENT: &str = "kvpublisher";
4743

44+
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
45+
4846
/// How we identify a python string endpoint
4947
#[cfg(feature = "python")]
5048
const PYTHON_STR_SCHEME: &str = "pystr:";
@@ -97,6 +95,8 @@ pub async fn run(
9795
// If output is an endpoint we are ingress and don't have a local model, but making an
9896
// empty one cleans up the code.
9997
Output::Endpoint(_) => Default::default(),
98+
99+
// All other output types have a local model
100100
_ => {
101101
match &maybe_path {
102102
Some(model_path) => {
@@ -143,7 +143,6 @@ pub async fn run(
143143
_ => None,
144144
};
145145

146-
#[cfg(any(feature = "vllm", feature = "sglang"))]
147146
let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process
148147

149148
let template = if let Some(path) = flags.request_template.as_ref() {
@@ -184,8 +183,42 @@ pub async fn run(
184183
engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?,
185184
model: Box::new(local_model),
186185
},
187-
#[cfg(feature = "sglang")]
186+
188187
Output::SgLang => {
188+
if !local_model.path().is_dir() {
189+
// TODO Does sglang support GGUF? Can we make it work?
190+
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
191+
}
192+
let (py_script, mut child) = match subprocess::start(
193+
subprocess::sglang::PY,
194+
local_model.path(),
195+
flags.tensor_parallel_size,
196+
if flags.base_gpu_id == 0 {
197+
None
198+
} else {
199+
Some(flags.base_gpu_id)
200+
},
201+
flags.extra_engine_args.as_deref(),
202+
)
203+
.await
204+
{
205+
Ok(x) => x,
206+
Err(err) => {
207+
anyhow::bail!("Failed starting sglang sub-process: {err}");
208+
}
209+
};
210+
let cancel_token = cancel_token.clone();
211+
212+
// Sub-process cleanup
213+
extra = Some(Box::pin(async move {
214+
stopper(cancel_token, child, py_script).await;
215+
}));
216+
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
217+
EngineConfig::Dynamic(endpoint)
218+
}
219+
220+
#[cfg(feature = "sglang")]
221+
Output::SgLangLegacy => {
189222
if !local_model.path().is_dir() {
190223
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
191224
}
@@ -295,7 +328,7 @@ pub async fn run(
295328
}
296329

297330
#[cfg(feature = "vllm")]
298-
Output::Vllm | Output::Vllm0_8 => {
331+
Output::Vllm0_8 => {
299332
if flags.base_gpu_id != 0 {
300333
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
301334
}
@@ -318,6 +351,35 @@ pub async fn run(
318351
}
319352
}
320353

354+
// No feature flag because it uses a sub-process, it's very cheap to include
355+
Output::Vllm => {
356+
if flags.base_gpu_id != 0 {
357+
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
358+
}
359+
let (py_script, mut child) = match subprocess::start(
360+
subprocess::vllm::PY,
361+
local_model.path(),
362+
flags.tensor_parallel_size,
363+
None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead
364+
flags.extra_engine_args.as_deref(),
365+
)
366+
.await
367+
{
368+
Ok(x) => x,
369+
Err(err) => {
370+
anyhow::bail!("Failed starting vllm sub-process: {err}");
371+
}
372+
};
373+
let cancel_token = cancel_token.clone();
374+
375+
// Sub-process cleanup
376+
extra = Some(Box::pin(async move {
377+
stopper(cancel_token, child, py_script).await;
378+
}));
379+
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
380+
EngineConfig::Dynamic(endpoint)
381+
}
382+
321383
#[cfg(feature = "llamacpp")]
322384
Output::LlamaCpp => {
323385
if !local_model.path().is_file() {
@@ -394,11 +456,50 @@ pub async fn run(
394456
}
395457
}
396458

397-
#[cfg(any(feature = "vllm", feature = "sglang"))]
398459
// Allow engines to ask main thread to wait on an extra future.
460+
// We use this to stop the vllm and sglang sub-process
399461
if let Some(extra) = extra {
400462
extra.await;
401463
}
402464

403465
Ok(())
404466
}
467+
468+
/// Wait for cancel_token to be cancelled, then stop the child as gracefully as possible.
469+
/// Keeps the TempPath alive until the child is stopped.
470+
async fn stopper(
471+
cancel_token: CancellationToken,
472+
mut child: tokio::process::Child,
473+
py_script: tempfile::TempPath,
474+
) {
475+
cancel_token.cancelled().await;
476+
477+
// Ask subprocess to stop gracefully
478+
if let Some(pid) = child.id() {
479+
unsafe { libc::kill(pid as i32, libc::SIGTERM) };
480+
}
481+
482+
tokio::select! {
483+
exit = child.wait() => {
484+
tracing::trace!("vllm sub-process graceful exit");
485+
match exit {
486+
Ok(exit_status) if exit_status.success() => {}
487+
Ok(exit_status) => {
488+
// This is nearly always 15 (SIGTERM)
489+
tracing::trace!("vllm sub-process non-0 exit: {exit_status}");
490+
}
491+
Err(err) => {
492+
tracing::warn!("vllm sub-process error getting exit status: {err}");
493+
}
494+
}
495+
}
496+
_ = tokio::time::sleep(CHILD_STOP_TIMEOUT) => {
497+
// It didn't stop in time, kill it
498+
child.kill().await.expect("Failed killing vllm subprocess");
499+
let _ = child.wait().await;
500+
}
501+
}
502+
// This temporary file contains the python script running the engine. It deletes on drop.
503+
// Keep it alive until the engine has stopped.
504+
drop(py_script);
505+
}

0 commit comments

Comments
 (0)