diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md
index e6917c6e25..dca9e142a7 100644
--- a/ATTRIBUTIONS-Python.md
+++ b/ATTRIBUTIONS-Python.md
@@ -441,7 +441,7 @@ License: `Apache`
 
 - `Homepage`: https://github.com/huggingface/accelerate
 
-## aiconfigurator (0.2.0)
+## aiconfigurator (0.4.0)
 
 ### Licenses
 License: `Apache-2.0`
diff --git a/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml b/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
index 2c2784c561..d8b15635cc 100644
--- a/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
@@ -19,8 +19,6 @@ spec:
       # AI Configurator mode (fast simulation-based profiling)
       use_ai_configurator: true
       aic_system: h200_sxm
-      aic_model_name: QWEN3_32B
-      aic_backend_version: "0.20.0"
 
       # SLA targets for profiling
       sla:
diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py
index aa7ef2cce5..6b75bd8fab 100644
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -149,9 +149,9 @@ async def run_profile(args):
             raise ValueError(
                 "Must provide --aic-system when using --use-ai-configurator."
             )
-        if not args.aic_model_name:
+        if not args.aic_hf_id:
             raise ValueError(
-                "Must provide --aic-model-name when using --use-ai-configurator."
+                "Must provide --aic-hf-id when using --use-ai-configurator."
             )
         if not args.aic_backend_version:
             raise ValueError(
@@ -160,15 +160,15 @@
 
         logger.info("Will use aiconfigurator to estimate perf.")
         ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
-            args.aic_model_name,
+            args.aic_hf_id,
             args.aic_system.lower(),
             args.aic_backend,
             args.aic_backend_version,
         )
     else:
-        if args.aic_system or args.aic_model_name or args.aic_backend_version:
+        if args.aic_system or args.aic_hf_id or args.aic_backend_version:
             logger.warning(
-                "Will ignore --aic-system, --aic-model-name, and/or --backend-version "
+                "Will ignore --aic-system, --aic-hf-id, and/or --aic-backend-version "
                 "when not using --use-ai-configurator."
             )
 
diff --git a/benchmarks/profiler/utils/profiler_argparse.py b/benchmarks/profiler/utils/profiler_argparse.py
index 5ae7b18bf1..6f6ec0ae7c 100644
--- a/benchmarks/profiler/utils/profiler_argparse.py
+++ b/benchmarks/profiler/utils/profiler_argparse.py
@@ -82,7 +82,7 @@ def create_profiler_parser() -> argparse.Namespace:
         decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
         use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
         aic_system: String (target system for use with aiconfigurator, default: None)
-        aic_model_name: String (aiconfigurator name of the target model, default: None)
+        aic_hf_id: String (HuggingFace ID of the target model, default: None)
         aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
        aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
         dry_run: Boolean (dry run the profile job, default: False)
@@ -281,10 +281,10 @@
         help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
     )
     parser.add_argument(
-        "--aic-model-name",
+        "--aic-hf-id",
         type=str,
-        default=config.get("sweep", {}).get("aic_model_name"),
-        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
QWEN3_32B, DEEPSEEK_V3)", + default=config.get("sweep", {}).get("aic_hf_id"), + help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)", ) parser.add_argument( "--aic-backend", diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml index d99b7c611c..9ee8804cd9 100644 --- a/benchmarks/pyproject.toml +++ b/benchmarks/pyproject.toml @@ -40,7 +40,7 @@ classifiers = [ ] dependencies = [ - "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@11b6d821f1fbb34300bb0ed4945f647e89fb411a", + "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759", "networkx", "pandas", "pydantic>=2", diff --git a/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml b/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml index 4c0e2982d0..a232a84748 100644 --- a/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml +++ b/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml @@ -54,7 +54,7 @@ spec: # AI Configurator mode (fast simulation-based profiling, 20-30 seconds) use_ai_configurator: false # Set to false for online profiling (2-4 hours) aic_system: h200_sxm # Target GPU system for AI Configurator - aic_model_name: QWEN3_0.6B # Model name for AI Configurator + aic_hf_id: Qwen/Qwen3-0.6B # Model name for AI Configurator aic_backend_version: "0.20.0" # Backend version for AI Configurator # SLA targets for profiling diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go index 1440b24488..7091d703ed 100644 --- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go +++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go @@ -350,7 +350,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { "sweep": map[string]interface{}{ "use_ai_configurator": true, "aic_system": "h200_sxm", - "aic_model_name": "QWEN3_32B", + "aic_hf_id": "Qwen/Qwen3-32B", "aic_backend_version": "0.20.0", }, }), @@ -1060,7 +1060,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { "sweep": map[string]interface{}{ "use_ai_configurator": true, "aic_system": "h200_sxm", - "aic_model_name": "QWEN3_32B", + "aic_hf_id": "Qwen/Qwen3-32B", "aic_backend_version": "0.20.0", }, }), diff --git a/docs/benchmarks/sla_driven_profiling.md b/docs/benchmarks/sla_driven_profiling.md index a9fec61324..d2fc6c25d8 100644 --- a/docs/benchmarks/sla_driven_profiling.md +++ b/docs/benchmarks/sla_driven_profiling.md @@ -299,17 +299,12 @@ profilingConfig: sweep: use_ai_configurator: true aic_system: h200_sxm # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm - aic_model_name: QWEN3_32B # AIC model identifier (see supported list) + aic_hf_id: Qwen/Qwen3-32B # AIC model identifier (see supported list) aic_backend_version: "0.20.0" # TensorRT-LLM version: 0.20.0, 1.0.0rc3, 1.0.0rc6 ``` **Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features) -**Model name mapping examples:** -- `Qwen/Qwen3-32B` → `QWEN3_32B` -- `meta-llama/Llama-3.1-70B` → `LLAMA3.1_70B` -- `deepseek-ai/DeepSeek-V3` → `DEEPSEEK_V3` - ### Planner Configuration (Optional) Pass arguments to the SLA planner: diff --git 
diff --git a/docs/planner/sla_planner_quickstart.md b/docs/planner/sla_planner_quickstart.md
index e504a16758..eec2eac74f 100644
--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -230,7 +230,7 @@ sweep:
 sweep:
   use_ai_configurator: true
   aic_system: h200_sxm
-  aic_model_name: QWEN3_32B
+  aic_hf_id: Qwen/Qwen3-32B
   aic_backend_version: "0.20.0"
 ```
diff --git a/tests/profiler/test_profile_sla_aiconfigurator.py b/tests/profiler/test_profile_sla_aiconfigurator.py
index 769140a910..650e5ed2b8 100644
--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
@@ -49,7 +49,7 @@ def __init__(self):
             self.dry_run = False
             self.use_ai_configurator = True
             self.aic_system = "h200_sxm"
-            self.aic_model_name = "QWEN3_32B"
+            self.aic_hf_id = "Qwen/Qwen3-32B"
             self.aic_backend = ""
             self.aic_backend_version = "0.20.0"
             self.num_gpus_per_node = 8
@@ -60,7 +60,7 @@ def __init__(self):
     @pytest.mark.pre_merge
     @pytest.mark.asyncio
     @pytest.mark.parametrize(
-        "missing_arg", ["aic_system", "aic_model_name", "aic_backend_version"]
+        "missing_arg", ["aic_system", "aic_hf_id", "aic_backend_version"]
     )
     async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg):
         # Check that validation error happens when a required arg is missing.
@@ -99,12 +99,12 @@ async def test_trtllm_aiconfigurator_single_model(self, trtllm_args):
             ("trtllm", "1.0.0rc3"),
         ],
     )
-    @pytest.mark.parametrize("model_name", ["QWEN3_32B", "LLAMA3.1_405B"])
+    @pytest.mark.parametrize("hf_model_id", ["Qwen/Qwen3-32B", "meta-llama/Llama-3.1-405B"])
     async def test_trtllm_aiconfigurator_many(
-        self, trtllm_args, model_name, backend, aic_backend_version
+        self, trtllm_args, hf_model_id, backend, aic_backend_version
     ):
         # Test that profile_sla works with a variety of backend versions and model names.
-        trtllm_args.aic_model_name = model_name
+        trtllm_args.aic_hf_id = hf_model_id
         trtllm_args.backend = backend
         trtllm_args.aic_backend_version = aic_backend_version
         await run_profile(trtllm_args)
diff --git a/tests/profiler/test_profile_sla_dryrun.py b/tests/profiler/test_profile_sla_dryrun.py
index eaf0a3c9de..676975fcd8 100644
--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -67,7 +67,7 @@ def __init__(self):
             self.dry_run = True
             self.use_ai_configurator = False
             self.aic_system = None
-            self.aic_model_name = None
+            self.aic_hf_id = None
             self.aic_backend = ""
             self.aic_backend_version = None
             self.num_gpus_per_node = 8
@@ -103,7 +103,7 @@ def __init__(self):
             self.dry_run = True
             self.use_ai_configurator = False
             self.aic_system = None
-            self.aic_model_name = None
+            self.aic_hf_id = None
             self.aic_backend = ""
             self.aic_backend_version = None
             self.num_gpus_per_node = 8
@@ -153,7 +153,7 @@ def __init__(self):
             self.dry_run = True
             self.use_ai_configurator = False
             self.aic_system = None
-            self.aic_model_name = None
+            self.aic_hf_id = None
             self.aic_backend = ""
             self.aic_backend_version = None
             self.num_gpus_per_node = 8
@@ -196,7 +196,7 @@ def __init__(self):
             self.dry_run = True
             self.use_ai_configurator = False
             self.aic_system = None
-            self.aic_model_name = None
+            self.aic_hf_id = None
             self.aic_backend = ""
             self.aic_backend_version = None
             self.num_gpus_per_node = 8
@@ -262,7 +262,7 @@ def __init__(self):
             self.dry_run = True
             self.use_ai_configurator = False
             self.aic_system = None
-            self.aic_model_name = None
+            self.aic_hf_id = None
             self.aic_backend = ""
             self.aic_backend_version = None
             self.num_gpus_per_node = 8  # Will be overridden by auto-generation
@@ -328,7 +328,7 @@ def __init__(self):
             self.dry_run = True
             self.use_ai_configurator = False
             self.aic_system = None
-            self.aic_model_name = None
+            self.aic_hf_id = None
             self.aic_backend = ""
             self.aic_backend_version = None
             self.num_gpus_per_node = 8  # Will be overridden by auto-generation
@@ -394,7 +394,7 @@ def __init__(self):
             self.dry_run = True
             self.use_ai_configurator = False
             self.aic_system = None
-            self.aic_model_name = None
+            self.aic_hf_id = None
             self.aic_backend = ""
             self.aic_backend_version = None
             self.num_gpus_per_node = 8  # Will be overridden by auto-generation
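A minimal sketch (not part of the diff) of how the renamed flag is exercised end to end. The script path and the `--use-ai-configurator`, `--aic-system`, `--aic-hf-id`, and `--aic-backend-version` flags are taken from the hunks above; the subprocess wrapper and the model choice are illustrative, and any other arguments the profiler requires are omitted.

```python
# Sketch: drive the profiler with the new --aic-hf-id flag, which now takes a
# HuggingFace model ID (e.g. Qwen/Qwen3-32B) instead of the old aiconfigurator
# enum name (--aic-model-name QWEN3_32B). Other required flags omitted.
import subprocess

subprocess.run(
    [
        "python",
        "benchmarks/profiler/profile_sla.py",
        "--use-ai-configurator",
        "--aic-system", "h200_sxm",
        "--aic-hf-id", "Qwen/Qwen3-32B",
        "--aic-backend-version", "0.20.0",
    ],
    check=True,
)
```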