Skip to content

Commit 28fd481

Browse files
authored
feat(dynamo-run): vllm and sglang subprocess engines (ai-dynamo#954)
New vllm and sglang engines that run in a sub-process. Will hopefully replace the existing embedded python engines. Why? - Pure Python, does not require knowing Rust to work on it. Much simpler to maintain. - No embedded Python interpreter which avoids linking libpython and avoids the MacOS virtualenv issues. - Should have better performance as it's "native" vllm / sglang. - Works with any version of vllm (including v1!) and sglang. Less upgrade struggle.
1 parent 9f0e12a commit 28fd481

File tree

17 files changed

+926
-52
lines changed

17 files changed

+926
-52
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ etcd-client = { version = "0.14" }
5959
futures = { version = "0.3" }
6060
hf-hub = { version = "0.4.2", default-features = false, features = ["tokio", "rustls-tls"] }
6161
humantime = { version = "2.2.0" }
62+
libc = { version = "0.2" }
6263
prometheus = { version = "0.14" }
6364
rand = { version = "0.9.0" }
6465
serde = { version = "1", features = ["derive"] }

docs/guides/dynamo_run.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -239,16 +239,16 @@ Inside that virtualenv:
239239
./dynamo-run in=http out=vllm ~/llm_models/Llama-3.2-3B-Instruct-Q6_K.gguf
240240
```
241241
242+
Note that vllm GGUF handling is very slow. Prefer llamacpp.
243+
242244
**Multi-node:**
243-
**Node 1:**
244-
```
245-
dynamo-run in=text out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --tensor-parallel-size 8 --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 0
246-
```
247245
248-
**Node 2:**
249-
```
250-
dynamo-run in=none out=vllm ~/llm_models/Llama-3.2-3B-Instruct/ --num-nodes 2 --leader-addr 10.217.98.122:6539 --node-rank 1
251-
```
246+
vllm uses [ray](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#running-vllm-on-multiple-nodes) for pipeline parallel inference. Dynamo does not change or manage that.
247+
248+
Head node (the one running `dynamo-run`): `ray start --head --port=6379 --dashboard-host 0.0.0.0`
249+
Each worker node: `ray start --address='<HEAD_NODE_IP>:6379'`
250+
251+
Remember to pass dynamo-run `--tensor-parallel-size <total-gpus-across-cluster>`, which is often constrained by a model dimension such as being a divisor of the number of attention heads.
252252
253253
To pass extra arguments to the vllm engine see [Extra engine arguments](#extra_engine_arguments) below.
254254

launch/dynamo-run/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ async-stream = { workspace = true }
5454
async-trait = { workspace = true }
5555
futures = { workspace = true }
5656
humantime = { workspace = true }
57+
libc = { workspace = true }
5758
serde = { workspace = true }
5859
serde_json = { workspace = true }
5960
tempfile = { workspace = true }
@@ -66,6 +67,7 @@ async-openai = { version = "0.27.2" }
6667
clap = { version = "4.5", features = ["derive", "env"] }
6768
dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] }
6869
futures-util = { version = "0.3" }
70+
regex = "1"
6971

7072
[target.x86_64-unknown-linux-gnu.dependencies]
7173
netlink-packet-route = { version = "0.19", optional = true }

launch/dynamo-run/src/input/batch.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ pub async fn run(
202202
tokens_out,
203203
tokens_out / cmp::max(elapsed.as_secs(), 1),
204204
);
205+
cancel_token.cancel(); // stop everything else
205206

206207
Ok(())
207208
}

launch/dynamo-run/src/input/http.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,9 @@ pub async fn run(
9999
}
100100
EngineConfig::None => unreachable!(),
101101
}
102-
http_service.run(runtime.primary_token()).await
102+
http_service.run(runtime.primary_token()).await?;
103+
runtime.shutdown(); // Cancel primary token
104+
Ok(())
103105
}
104106

105107
/// Spawns a task that watches for new models in etcd at network_prefix,

launch/dynamo-run/src/input/text.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ async fn main_loop(
191191
break;
192192
}
193193
}
194+
cancel_token.cancel(); // stop everything else
194195
println!();
195196
Ok(())
196197
}

launch/dynamo-run/src/lib.rs

Lines changed: 112 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@
1515

1616
#[cfg(any(feature = "vllm", feature = "sglang"))]
1717
use std::{future::Future, pin::Pin};
18-
use std::{io::Read, sync::Arc};
18+
use std::{io::Read, sync::Arc, time::Duration};
1919

2020
use anyhow::Context;
2121
use dynamo_llm::{
2222
backend::ExecutionContext, engines::StreamingEngine, kv_router::publisher::KvMetricsPublisher,
2323
LocalModel,
2424
};
25-
use dynamo_runtime::{protocols::Endpoint, DistributedRuntime};
25+
use dynamo_runtime::{protocols::Endpoint, CancellationToken, DistributedRuntime};
2626

2727
mod flags;
2828
pub use flags::Flags;
@@ -32,11 +32,7 @@ mod net;
3232
mod opt;
3333
pub use dynamo_llm::request_template::RequestTemplate;
3434
pub use opt::{Input, Output};
35-
36-
/// How we identify a namespace/component/endpoint URL.
37-
/// Technically the '://' is not part of the scheme but it eliminates several string
38-
/// concatenations.
39-
const ENDPOINT_SCHEME: &str = "dyn://";
35+
mod subprocess;
4036

4137
/// When `in=text` the user doesn't need to know the model name, and doesn't need to provide it on
4238
/// the command line. Hence it's optional, and defaults to this.
@@ -45,6 +41,8 @@ const INVISIBLE_MODEL_NAME: &str = "dynamo-run";
4541
/// The component name for the KV publisher, if used
4642
const KV_PUBLISHER_COMPONENT: &str = "kvpublisher";
4743

44+
const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
45+
4846
/// How we identify a python string endpoint
4947
#[cfg(feature = "python")]
5048
const PYTHON_STR_SCHEME: &str = "pystr:";
@@ -97,6 +95,8 @@ pub async fn run(
9795
// If output is an endpoint we are ingress and don't have a local model, but making an
9896
// empty one cleans up the code.
9997
Output::Endpoint(_) => Default::default(),
98+
99+
// All other output types have a local model
100100
_ => {
101101
match &maybe_path {
102102
Some(model_path) => {
@@ -143,7 +143,6 @@ pub async fn run(
143143
_ => None,
144144
};
145145

146-
#[cfg(any(feature = "vllm", feature = "sglang"))]
147146
let mut extra: Option<Pin<Box<dyn Future<Output = ()> + Send>>> = None; // vllm and sglang sub-process
148147

149148
let template = if let Some(path) = flags.request_template.as_ref() {
@@ -184,8 +183,42 @@ pub async fn run(
184183
engine: dynamo_engine_mistralrs::make_engine(local_model.path()).await?,
185184
model: Box::new(local_model),
186185
},
187-
#[cfg(feature = "sglang")]
186+
188187
Output::SgLang => {
188+
if !local_model.path().is_dir() {
189+
// TODO Does sglang support GGUF? Can we make it work?
190+
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
191+
}
192+
let (py_script, mut child) = match subprocess::start(
193+
subprocess::sglang::PY,
194+
local_model.path(),
195+
flags.tensor_parallel_size,
196+
if flags.base_gpu_id == 0 {
197+
None
198+
} else {
199+
Some(flags.base_gpu_id)
200+
},
201+
flags.extra_engine_args.as_deref(),
202+
)
203+
.await
204+
{
205+
Ok(x) => x,
206+
Err(err) => {
207+
anyhow::bail!("Failed starting sglang sub-process: {err}");
208+
}
209+
};
210+
let cancel_token = cancel_token.clone();
211+
212+
// Sub-process cleanup
213+
extra = Some(Box::pin(async move {
214+
stopper(cancel_token, child, py_script).await;
215+
}));
216+
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
217+
EngineConfig::Dynamic(endpoint)
218+
}
219+
220+
#[cfg(feature = "sglang")]
221+
Output::SgLangLegacy => {
189222
if !local_model.path().is_dir() {
190223
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
191224
}
@@ -295,7 +328,7 @@ pub async fn run(
295328
}
296329

297330
#[cfg(feature = "vllm")]
298-
Output::Vllm | Output::Vllm0_8 => {
331+
Output::Vllm0_8 => {
299332
if flags.base_gpu_id != 0 {
300333
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
301334
}
@@ -318,6 +351,35 @@ pub async fn run(
318351
}
319352
}
320353

354+
// No feature flag because it uses a sub-process, it's very cheap to include
355+
Output::Vllm => {
356+
if flags.base_gpu_id != 0 {
357+
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
358+
}
359+
let (py_script, mut child) = match subprocess::start(
360+
subprocess::vllm::PY,
361+
local_model.path(),
362+
flags.tensor_parallel_size,
363+
None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead
364+
flags.extra_engine_args.as_deref(),
365+
)
366+
.await
367+
{
368+
Ok(x) => x,
369+
Err(err) => {
370+
anyhow::bail!("Failed starting vllm sub-process: {err}");
371+
}
372+
};
373+
let cancel_token = cancel_token.clone();
374+
375+
// Sub-process cleanup
376+
extra = Some(Box::pin(async move {
377+
stopper(cancel_token, child, py_script).await;
378+
}));
379+
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
380+
EngineConfig::Dynamic(endpoint)
381+
}
382+
321383
#[cfg(feature = "llamacpp")]
322384
Output::LlamaCpp => {
323385
if !local_model.path().is_file() {
@@ -394,11 +456,50 @@ pub async fn run(
394456
}
395457
}
396458

397-
#[cfg(any(feature = "vllm", feature = "sglang"))]
398459
// Allow engines to ask main thread to wait on an extra future.
460+
// We use this to stop the vllm and sglang sub-process
399461
if let Some(extra) = extra {
400462
extra.await;
401463
}
402464

403465
Ok(())
404466
}
467+
468+
/// Wait for cancel_token to be cancelled, then stop the child as gracefully as possible.
469+
/// Keeps the TempPath alive until the child is stopped.
470+
async fn stopper(
471+
cancel_token: CancellationToken,
472+
mut child: tokio::process::Child,
473+
py_script: tempfile::TempPath,
474+
) {
475+
cancel_token.cancelled().await;
476+
477+
// Ask subprocess to stop gracefully
478+
if let Some(pid) = child.id() {
479+
unsafe { libc::kill(pid as i32, libc::SIGTERM) };
480+
}
481+
482+
tokio::select! {
483+
exit = child.wait() => {
484+
tracing::trace!("vllm sub-process graceful exit");
485+
match exit {
486+
Ok(exit_status) if exit_status.success() => {}
487+
Ok(exit_status) => {
488+
// This is nearly always 15 (SIGTERM)
489+
tracing::trace!("vllm sub-process non-0 exit: {exit_status}");
490+
}
491+
Err(err) => {
492+
tracing::warn!("vllm sub-process error getting exit status: {err}");
493+
}
494+
}
495+
}
496+
_ = tokio::time::sleep(CHILD_STOP_TIMEOUT) => {
497+
// It didn't stop in time, kill it
498+
child.kill().await.expect("Failed killing vllm subprocess");
499+
let _ = child.wait().await;
500+
}
501+
}
502+
// This temporary file contains the python script running the engine. It deletes on drop.
503+
// Keep it alive until the engine has stopped.
504+
drop(py_script);
505+
}

0 commit comments

Comments
 (0)