ai-dynamo
diff --git a/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml b/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
Lines changed: 10 additions & 17 deletions
diff --git a/benchmarks/profiler/deploy/profile_sla_dgdr.yaml b/benchmarks/profiler/deploy/profile_sla_dgdr.yaml
Lines changed: 8 additions & 13 deletions
diff --git a/benchmarks/profiler/deploy/profile_sla_moe_dgdr.yaml b/benchmarks/profiler/deploy/profile_sla_moe_dgdr.yaml
Lines changed: 19 additions & 16 deletions
@@ -12,26 +12,19 @@ spec:
 
   # ProfilingConfig maps directly to the profile_sla.py config format
   profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
+    profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
+
+    # NOTE: images built before January 10 and releases prior to 0.8.1 must use
+    # snake_case keys (e.g. use_ai_configurator) within profilingConfig.config
     config:
-      # Sweep/profiling configuration
       sweep:
-        # AI Configurator mode (fast simulation-based profiling)
-        use_ai_configurator: true
-        aic_system: h200_sxm
-        aic_hf_id: Qwen/Qwen3-32B
-        aic_backend_version: "0.20.0"
-
-      # SLA targets for profiling
+        useAiConfigurator: true
+        aicSystem: h200_sxm
       sla:
-        isl: 3000   # Input sequence length
-        osl: 150    # Output sequence length
-        ttft: 500.0 # Time To First Token target (milliseconds)
-        itl: 30.0   # Inter-Token Latency target (milliseconds)
-
-  # Deployment overrides for the auto-created DGD
+        isl: 3000
+        osl: 150
+        ttft: 500.0
+        itl: 30.0
   deploymentOverrides:
     workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
-
-  # Automatically create DynamoGraphDeployment after profiling
   autoApply: true
@@ -13,22 +13,17 @@ spec:
   # ProfilingConfig maps directly to the profile_sla.py config format
   profilingConfig:
     profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
+
+    # NOTE: images built before January 10 and releases prior to 0.8.1 must use
+    # snake_case keys (e.g. use_ai_configurator) within profilingConfig.config
     config:
-      # Sweep/profiling configuration
       sweep:
-        # Online profiling mode (real deployment testing)
-        use_ai_configurator: false
-
-      # SLA targets for profiling
+        useAiConfigurator: false
       sla:
-        isl: 3000   # Input sequence length
-        osl: 150    # Output sequence length
-        ttft: 200.0 # Time To First Token target (milliseconds)
-        itl: 20.0   # Inter-Token Latency target (milliseconds)
-
-  # Deployment overrides for the auto-created DGD
+        isl: 3000
+        osl: 150
+        ttft: 200.0
+        itl: 20.0
   deploymentOverrides:
     workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
-
-  # Automatically create DynamoGraphDeployment after profiling
   autoApply: true
@@ -13,25 +13,31 @@ spec:
 
   # ProfilingConfig maps directly to the profile_sla.py config format
   profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
+    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
+
+    # NOTE: images built before January 10 and releases prior to 0.8.1 must use
+    # snake_case keys (e.g. use_ai_configurator) within profilingConfig.config
     config:
-      # Sweep/profiling configuration
+      # 0.8.1 and later: Model cache PVC to access model weights
+      deployment:
+        modelCache:
+          pvcName: "model-cache"                      # Name of PVC containing model weights
+          pvcPath: "deepseek-r1"                      # Subpath within PVC where model is stored
+
       sweep:
-        # Standard online profiling (not using AI Configurator)
-        use_ai_configurator: false
+        useAiConfigurator: false
 
       hardware:
         # for h200, sweep over 8-16 GPUs per engine
-        min_num_gpus_per_engine: 8
-        max_num_gpus_per_engine: 16
-        num_gpus_per_node: 8
+        minNumGpusPerEngine: 8
+        maxNumGpusPerEngine: 16
+        numGpusPerNode: 8
 
-      # SLA targets for profiling
       sla:
-        isl: 3000   # Input sequence length
-        osl: 150    # Output sequence length
-        ttft: 200.0 # Time To First Token target (milliseconds)
-        itl: 20.0   # Inter-Token Latency target (milliseconds)
+        isl: 3000
+        osl: 150
+        ttft: 200.0
+        itl: 20.0
 
     # Reference to ConfigMap containing the DGD base config
     # For MoE models, this should point to the appropriate disagg config
@@ -40,10 +46,7 @@ spec:
       name: deepseek-r1-config
       key: tep16p-dep16d-disagg.yaml
 
-  # Deployment overrides for the auto-created DGD
   deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
-
-  # Automatically create DynamoGraphDeployment after profiling
+    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
   autoApply: true