82 changes: 82 additions & 0 deletions k2-inference/README.md
# K2 Inference: Kimi K2 Multinode Inference

This example demonstrates distributed vLLM inference for Moonshot AI's Kimi-K2-Instruct model across multiple GPU nodes using Ray orchestration on Modal.

## Overview

The setup runs Kimi-K2-Instruct with:
- 4 nodes with 8x H100 GPUs each (32 H100s total)
- Tensor parallel size: 8, Pipeline parallel size: 4
- RDMA networking for high-performance inter-node communication
- Ray for distributed orchestration
- vLLM nightly build for Kimi-K2-Instruct pipeline parallelism support
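The parallelism degrees must tile the cluster: tensor parallel size times pipeline parallel size (times data parallel size, when used) must equal the total GPU count. A quick sanity check over the layouts used in this example:

```python
# Both parallel layouts in this example tile the same 32-GPU cluster.
nodes, gpus_per_node = 4, 8
world_size = nodes * gpus_per_node  # 32 H100s total
assert world_size == 32

# (tp, pp) for the default deployment and for alt_deployments/k2_pp2.py
for tp, pp in [(8, 4), (16, 2)]:
    assert tp * pp == world_size
```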

## Usage

**Run the inference server:**

```bash
modal deploy modal_infer.py
```

This starts a vLLM server on port 8000 of the head node, exposed via the Flash URL printed in the CLI output. This configuration shards the model with 8-way tensor parallelism and 4-way pipeline parallelism; see the classes under `alt_deployments/` for other parallelism layouts.

**Curl the web endpoint:**

```console
curl -X POST https://{WORKSPACE}-{ENVIRONMENT}--k2-multinode-inference-k2tp8pp4ep-dev.modal.run/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "kimi-k2",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Hello! How are you today?"
}
],
"temperature": 0.7,
"max_tokens": 150
}'
```
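The same request can be made from Python. The helper below is illustrative (it is not part of this repo) and uses only the standard library; substitute the Flash URL printed by `modal deploy` for `base_url`.

```python
import json
import urllib.request

def build_payload(prompt: str) -> dict:
    """Mirror the JSON body from the curl example above."""
    return {
        "model": "kimi-k2",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.7,
        "max_tokens": 150,
    }

def ask_k2(base_url: str, prompt: str, timeout: float = 120.0) -> str:
    """POST a chat completion to the deployed server and return the reply text."""
    req = urllib.request.Request(
        f"{base_url}/v1/chat/completions",
        data=json.dumps(build_payload(prompt)).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        body = json.load(resp)
    return body["choices"][0]["message"]["content"]
```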

### Configuration

The example is pre-configured for Kimi-K2-Instruct with:
- Model: `moonshotai/Kimi-K2-Instruct`
- Context length: 128,000 tokens
- Max sequences: 256
- Tensor parallel: 8 GPUs
- Pipeline parallel: 4 nodes

To modify these settings, inherit from `K2Inference` and set the `tp_size`, `pp_size`, `dp_size`, `nodes`, `max_seqs`, `max_model_len`, and `enable_expert_parallel` class attributes. See `alt_deployments/k2_pp2.py` for an example.
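For instance, a hypothetical variant with a shorter context window might look like the sketch below, which follows the same pattern as `alt_deployments/k2_pp2.py`; the class name and every attribute value here are illustrative, not a tested configuration.

```python
import modal
import modal.experimental
from inference import K2Inference, app, hf_cache_volume, image, vllm_cache_volume

@app.cls(
    image=image,
    gpu="H100:8",
    volumes={
        "/root/.cache/huggingface": hf_cache_volume,
        "/root/.cache/vllm": vllm_cache_volume,
    },
    timeout=60 * 60,
    min_containers=1,
    experimental_options={"flash": "us-east"},
)
@modal.experimental.clustered(size=4, rdma=True)
class K2Tp8Pp4Short(K2Inference):
    # 4x8 H100; tp=8, pp=4; shorter context leaves more room for KV cache
    tp_size = 8
    pp_size = 4
    dp_size = 1
    nodes = 4
    max_seqs = 256
    max_model_len = 32000
    enable_expert_parallel = True
```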

## Load Testing

Test performance and identify bottlenecks using Locust:

```bash
# Basic load test
modal run load_test.py --target-url https://your-deployment.modal.run

# High-load distributed test
modal run load_test.py --target-url https://your-deployment.modal.run \
--distributed --workers 8 --users 1000 --time 15m
```

**Parameters:**
- `--users`: Concurrent users (default: 100)
- `--spawn-rate`: Users/second spawn rate (default: 10)
- `--time`: Test duration, e.g. "5m", "2h" (default: 5m)
- `--distributed`: Enable multi-worker testing
- `--workers`: Worker processes for distributed tests (default: 4)

**Endpoints tested:**
- `/v1/chat/completions` (standard + streaming)
- `/v1/models`
- `/health`

Results are auto-saved to the Modal volume `k2-loadtest-results` with CSV stats, HTML reports, and logs. Expected baseline: ~40 tokens/s for a single request; aggregate throughput scales with concurrency up to `max_seqs=256`.
33 changes: 33 additions & 0 deletions k2-inference/alt_deployments/k2_dp2.py
import pathlib
import modal
import modal.experimental
from inference import K2Inference, app, hf_cache_volume, image, vllm_cache_volume

local_path = pathlib.Path(__file__).parent.parent / "inference.py"
image = image.add_local_file(local_path, "/root/inference.py")


@app.cls(
image=image,
gpu="H100:8",
volumes={
"/root/.cache/huggingface": hf_cache_volume,
"/root/.cache/vllm": vllm_cache_volume,
},
timeout=60 * 60 * 1,
min_containers=1,
experimental_options={"flash": "us-east"},
)
@modal.experimental.clustered(size=4, rdma=True)
class K2Tp8Dp2Ep(K2Inference):
# 4x8H100
# tp=8,pp=1,dp=2,ep=tp*dp=16
# single request decodes at ~20 tokens/s
# trading more comm for less risk of pipeline bubbles
tp_size = 8
pp_size = 1
dp_size = 2
nodes = 4
max_seqs = 8
max_model_len = 64000
enable_expert_parallel = True
33 changes: 33 additions & 0 deletions k2-inference/alt_deployments/k2_pp2.py
import pathlib
import modal
import modal.experimental
from inference import K2Inference, app, hf_cache_volume, image, vllm_cache_volume

local_path = pathlib.Path(__file__).parent.parent / "inference.py"
image = image.add_local_file(local_path, "/root/inference.py")


@app.cls(
image=image,
gpu="H100:8",
volumes={
"/root/.cache/huggingface": hf_cache_volume,
"/root/.cache/vllm": vllm_cache_volume,
},
timeout=60 * 60 * 1,
min_containers=1,
experimental_options={"flash": "us-east"},
)
@modal.experimental.clustered(size=4, rdma=True)
class K2Tp16Pp2Ep(K2Inference):
# 4x8H100
# tp=ep=16,pp=2,dp=1
# single request decodes at ~20 tokens/s
# trading more comm for less risk of pipeline bubbles
tp_size = 16
pp_size = 2
dp_size = 1
nodes = 4
max_seqs = 256
max_model_len = 128000
enable_expert_parallel = True