Skip to content

Commit 2745242

Browse files
authored
cherry-pick: DGDR profilingConfig casing and doc updates, model PVC (#5537)
Signed-off-by: Hannah Zhang <[email protected]>
1 parent 21a9e23 commit 2745242

13 files changed

+375
-213
lines changed

benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,19 @@ spec:
1212

1313
# ProfilingConfig maps directly to the profile_sla.py config format
1414
profilingConfig:
15-
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
15+
profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
16+
17+
# NOTE: any image built before January 10 and any release prior to 0.8.1
18+
# must use snake_case keys within profilingConfig.config
1619
config:
17-
# Sweep/profiling configuration
1820
sweep:
19-
# AI Configurator mode (fast simulation-based profiling)
20-
use_ai_configurator: true
21-
aic_system: h200_sxm
22-
aic_hf_id: Qwen/Qwen3-32B
23-
aic_backend_version: "0.20.0"
24-
25-
# SLA targets for profiling
21+
useAiConfigurator: true
22+
aicSystem: h200_sxm
2623
sla:
27-
isl: 3000 # Input sequence length
28-
osl: 150 # Output sequence length
29-
ttft: 500.0 # Time To First Token target (milliseconds)
30-
itl: 30.0 # Inter-Token Latency target (milliseconds)
31-
32-
# Deployment overrides for the auto-created DGD
24+
isl: 3000
25+
osl: 150
26+
ttft: 500.0
27+
itl: 30.0
3328
deploymentOverrides:
3429
workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
35-
36-
# Automatically create DynamoGraphDeployment after profiling
3730
autoApply: true

benchmarks/profiler/deploy/profile_sla_dgdr.yaml

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,17 @@ spec:
1313
# ProfilingConfig maps directly to the profile_sla.py config format
1414
profilingConfig:
1515
profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
16+
17+
# NOTE: any image built before January 10 and any release prior to 0.8.1
18+
# must use snake_case keys within profilingConfig.config
1619
config:
17-
# Sweep/profiling configuration
1820
sweep:
19-
# Online profiling mode (real deployment testing)
20-
use_ai_configurator: false
21-
22-
# SLA targets for profiling
21+
useAiConfigurator: false
2322
sla:
24-
isl: 3000 # Input sequence length
25-
osl: 150 # Output sequence length
26-
ttft: 200.0 # Time To First Token target (milliseconds)
27-
itl: 20.0 # Inter-Token Latency target (milliseconds)
28-
29-
# Deployment overrides for the auto-created DGD
23+
isl: 3000
24+
osl: 150
25+
ttft: 200.0
26+
itl: 20.0
3027
deploymentOverrides:
3128
workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
32-
33-
# Automatically create DynamoGraphDeployment after profiling
3429
autoApply: true

benchmarks/profiler/deploy/profile_sla_moe_dgdr.yaml

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,31 @@ spec:
1313

1414
# ProfilingConfig maps directly to the profile_sla.py config format
1515
profilingConfig:
16-
profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
16+
profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
17+
18+
# NOTE: any image built before January 10 and any release prior to 0.8.1
19+
# must use snake_case keys within profilingConfig.config
1720
config:
18-
# Sweep/profiling configuration
21+
# 0.8.1 and later: Model cache PVC to access model weights
22+
deployment:
23+
modelCache:
24+
pvcName: "model-cache" # Name of PVC containing model weights
25+
pvcPath: "deepseek-r1" # Subpath within PVC where model is stored
26+
1927
sweep:
20-
# Standard online profiling (not using AI Configurator)
21-
use_ai_configurator: false
28+
useAiConfigurator: false
2229

2330
hardware:
2431
# for h200, sweep over 8-16 GPUs per engine
25-
min_num_gpus_per_engine: 8
26-
max_num_gpus_per_engine: 16
27-
num_gpus_per_node: 8
32+
minNumGpusPerEngine: 8
33+
maxNumGpusPerEngine: 16
34+
numGpusPerNode: 8
2835

29-
# SLA targets for profiling
3036
sla:
31-
isl: 3000 # Input sequence length
32-
osl: 150 # Output sequence length
33-
ttft: 200.0 # Time To First Token target (milliseconds)
34-
itl: 20.0 # Inter-Token Latency target (milliseconds)
37+
isl: 3000
38+
osl: 150
39+
ttft: 200.0
40+
itl: 20.0
3541

3642
# Reference to ConfigMap containing the DGD base config
3743
# For MoE models, this should point to the appropriate disagg config
@@ -40,10 +46,7 @@ spec:
4046
name: deepseek-r1-config
4147
key: tep16p-dep16d-disagg.yaml
4248

43-
# Deployment overrides for the auto-created DGD
4449
deploymentOverrides:
45-
workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
46-
47-
# Automatically create DynamoGraphDeployment after profiling
50+
workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
4851
autoApply: true
4952

0 commit comments

Comments
 (0)