Skip to content

Commit 1b4478a

Browse files
committed
add --kt-numa-nodes CLI parameter for explicit NUMA node mapping
Add --kt-numa-nodes parameter to ServerArgs and thread it through KTConfig to KTMoEWrapper. This allows users to specify which NUMA node IDs to bind to, enabling multi-instance deployment on different NUMA nodes without external numactl workarounds.

Usage: --kt-threadpool-count 1 --kt-numa-nodes 1 (binds to NUMA node 1 instead of defaulting to node 0)

Companion to kvcache-ai/ktransformers#1891
1 parent f6adb4f commit 1b4478a

File tree

2 files changed

+13
-0
lines changed

2 files changed

+13
-0
lines changed

python/sglang/srt/layers/moe/kt_ep_wrapper.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class KTConfig:
82 82
num_layers: Optional[int] = None
83 83
gpu_prefill_token_threshold: Optional[int] = None
84 84
kt_enable_dynamic_expert_update: bool = False
85 +
numa_nodes: Optional[List[int]] = None
85 86

86 87

87 88
_SHARED_FULL_CONTEXT = None
@@ -1667,6 +1668,7 @@ def create_kt_config_from_server_args(
1667 1668
num_layers=num_layers,
1668 1669
gpu_prefill_token_threshold=server_args.kt_gpu_prefill_token_threshold,
1669 1670
kt_enable_dynamic_expert_update=server_args.kt_enable_dynamic_expert_update,
1671 +
numa_nodes=[int(x) for x in server_args.kt_numa_nodes.split(",")] if server_args.kt_numa_nodes else None,
1670 1672
)
1671 1673

1672 1674

@@ -2105,6 +2107,7 @@ def create_weights(
2105 2107
chunked_prefill_size=self.kt_config.chunked_prefill_size,
2106 2108
method=self.kt_config.method,
2107 2109
max_deferred_experts_per_token=layer_max_deferred,
2110 +
numa_nodes=self.kt_config.numa_nodes,
2108 2111
)
2109 2112

2110 2113
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

python/sglang/srt/server_args.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,7 @@ class ServerArgs:
554 554
kt_method: Optional[str] = None
555 555
kt_cpuinfer: Optional[int] = None
556 556
kt_threadpool_count: Optional[int] = None
557 +
kt_numa_nodes: Optional[str] = None
557 558
kt_num_gpu_experts: Optional[int] = None
558 559
kt_gpu_experts_ratio: Optional[float] = None
559 560
kt_max_deferred_experts_per_token: Optional[int] = None
@@ -4452,6 +4453,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
4452 4453
default=2,
4453 4454
help="[ktransformers parameter] One-to-one with the number of NUMA nodes (one thread pool per NUMA).",
4454 4455
)
4456 +
parser.add_argument(
4457 +
"--kt-numa-nodes",
4458 +
type=str,
4459 +
default=None,
4460 +
help="[ktransformers parameter] Comma-separated list of NUMA node IDs for subpool mapping. "
4461 +
"E.g. \"1\" to bind to NUMA node 1, or \"2,3\" for nodes 2 and 3. "
4462 +
"Must match --kt-threadpool-count in length. "
4463 +
"If not set, defaults to sequential IDs [0, 1, ..., threadpool_count-1].",
4464 +
)
4455 4465
parser.add_argument(
4456 4466
"--kt-num-gpu-experts",
4457 4467
type=int,

0 commit comments

Comments
 (0)