fix: harden max_concurrency with validation, autoscale coverage, and tests

Arsene12358 · Arsene12358 · commit dcf8615d9458 · 2026-03-03T21:21:53.000+08:00
- to_yaml() now includes max_concurrency when set (was silently dropped)
- Autoscale path (pick_autoscale) now filters by max_concurrency
- Validate max_concurrency >= 1 in TaskConfig (raises ValueError)
- Add INFO-level log when max_concurrency constraint is active
- Update find_best_disagg_result_under_constraints docstring
- Add tests: validation rejects 0/-5, to_yaml round-trip with/without max_concurrency

Signed-off-by: Yimingl <yimingl@nvidia.com>
diff --git a/src/aiconfigurator/sdk/inference_session.py b/src/aiconfigurator/sdk/inference_session.py
@@ -406,6 +406,7 @@ def _pick_autoscale(
         target_ttft: float | None = None,
         target_tpot: float | None = None,
         top_n: int = 5,
+        max_concurrency: int | None = None,
     ) -> InferenceSummary:
         """Pick best prefill and decode engines independently for autoscaling.
 
@@ -427,6 +428,7 @@ def _pick_autoscale(
             target_ttft=target_ttft,
             target_tpot=target_tpot,
             top_n=top_n,
+            max_concurrency=max_concurrency,
         )
 
         disagg_summary_df = result["best_config_df"]
@@ -478,6 +480,13 @@ def find_best_disagg_result_under_constraints(
             decode_max_num_tokens (int): the decode max num tokens
             decode_num_worker_list (List[int]): the decode num worker list
             num_gpu_list (Optional[List[int]]): the num gpu list
+            require_same_tp (bool): require same TP for prefill and decode
+            autoscale (bool): use autoscale picking (P and D chosen independently)
+            target_tpot (Optional[float]): TPOT target for autoscale mode
+            max_concurrency (Optional[int]): maximum global concurrency.
+                Compositions whose ``concurrency`` exceeds this value are
+                excluded from the search in both rate-matching and autoscale
+                paths.
 
         Returns:
             Optional[InferenceSummary]: the summary of the inference result, contains all the
@@ -710,6 +719,7 @@ def _find_best_result_under_constraints(
                 runtime_config=runtime_config,
                 disagg_summary=disagg_summary,
                 target_tpot=target_tpot,
+                max_concurrency=max_concurrency,
             )
 
         # find best result under constraints
diff --git a/src/aiconfigurator/sdk/pareto_analysis.py b/src/aiconfigurator/sdk/pareto_analysis.py
@@ -51,6 +51,9 @@ def agg_pareto(
         results_df: dataframe of the results
     """
 
+    if max_concurrency is not None:
+        logger.info("agg_pareto: max_concurrency=%d is active; capping batch-size sweep per config", max_concurrency)
+
     # agg is agg server, the loop over parallel is outside here.
     results_df = pd.DataFrame(columns=ColumnsAgg)
     exceptions = []
@@ -284,6 +287,8 @@ def get_working_list(working_list, max_constraint):
     autoscale = kwargs.get("autoscale", False)
     target_tpot = kwargs.get("target_tpot")
     max_concurrency = kwargs.get("max_concurrency")
+    if max_concurrency is not None:
+        logger.info("disagg_pareto: max_concurrency=%d is active; filtering compositions", max_concurrency)
 
     summary = disagg_sess.find_best_disagg_result_under_constraints(
         model_path=model_path,
diff --git a/src/aiconfigurator/sdk/picking.py b/src/aiconfigurator/sdk/picking.py
@@ -373,6 +373,7 @@ def pick_autoscale(
     target_ttft: float,
     target_tpot: float,
     top_n: int = 5,
+    max_concurrency: int | None = None,
 ) -> dict[str, Any]:
     """Pick prefill and decode engines independently for autoscaling.
 
@@ -462,6 +463,8 @@ def pick_autoscale(
                 decode_summary_dict=d_row.to_dict(),
                 decode_num_worker=1,
             )
+            if max_concurrency is not None and combo["concurrency"] > max_concurrency:
+                continue
             all_combos.append(combo)
 
     if not all_combos:
diff --git a/src/aiconfigurator/sdk/task.py b/src/aiconfigurator/sdk/task.py
@@ -638,6 +638,9 @@ def __init__(
             effective_profiles = list(dict.fromkeys([*effective_profiles, *yaml_profiles]))
             yaml_patch = yaml_config.get("config", yaml_config)
 
+        if max_concurrency is not None and max_concurrency < 1:
+            raise ValueError(f"max_concurrency must be >= 1, got {max_concurrency}")
+
         ctx = TaskContext(
             serving_mode=serving_mode,
             model_path=model_path,
@@ -911,6 +914,8 @@ def _convert(obj: Any) -> Any:
         )
 
         printable["enable_wideep"] = self.enable_wideep
+        if self.max_concurrency is not None:
+            printable["max_concurrency"] = self.max_concurrency
         printable["moe_backend"] = self.config.moe_backend
         printable["attention_backend"] = self.config.attention_backend
 
diff --git a/tests/unit/sdk/task/test_task.py b/tests/unit/sdk/task/test_task.py
@@ -832,3 +832,52 @@ def test_agg_max_concurrency_none_by_default(monkeypatch):
     TaskRunner().run(task)
 
     assert captured.get("max_concurrency") is None
+
+
+def test_taskconfig_max_concurrency_zero_rejected():
+    """max_concurrency=0 should raise ValueError."""
+    with pytest.raises(ValueError, match=r"max_concurrency must be >= 1"):
+        TaskConfig(
+            serving_mode="agg",
+            model_path="Qwen/Qwen3-32B",
+            system_name="h200_sxm",
+            max_concurrency=0,
+        )
+
+
+def test_taskconfig_max_concurrency_negative_rejected():
+    """Negative max_concurrency should raise ValueError."""
+    with pytest.raises(ValueError, match=r"max_concurrency must be >= 1"):
+        TaskConfig(
+            serving_mode="agg",
+            model_path="Qwen/Qwen3-32B",
+            system_name="h200_sxm",
+            max_concurrency=-5,
+        )
+
+
+def test_taskconfig_to_yaml_includes_max_concurrency():
+    """to_yaml() must include max_concurrency when it is set."""
+    task = TaskConfig(
+        serving_mode="agg",
+        model_path="Qwen/Qwen3-32B",
+        system_name="h200_sxm",
+        max_concurrency=256,
+    )
+    yaml_output = task.to_yaml()
+    parsed = yaml.safe_load(yaml_output)
+    task_name = task.task_name
+    assert parsed[task_name]["max_concurrency"] == 256
+
+
+def test_taskconfig_to_yaml_omits_max_concurrency_when_none():
+    """to_yaml() must not include max_concurrency when it is None."""
+    task = TaskConfig(
+        serving_mode="agg",
+        model_path="Qwen/Qwen3-32B",
+        system_name="h200_sxm",
+    )
+    yaml_output = task.to_yaml()
+    parsed = yaml.safe_load(yaml_output)
+    task_name = task.task_name
+    assert "max_concurrency" not in parsed[task_name]