Skip to content

Commit 1eee250

Browse files
committed
fix_A3_ACLgraph_sizes_capture_bug_and_add_new_ut
Signed-off-by: lilinsiman <[email protected]>
1 parent 992271b commit 1eee250

File tree

2 files changed

+56
-11
lines changed

2 files changed

+56
-11
lines changed

tests/e2e/multicard/test_qwen3_moe.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
"""
2323

2424
from modelscope import snapshot_download # type: ignore
25-
25+
import os
2626
from tests.e2e.conftest import VllmRunner
2727

2828

@@ -72,3 +72,36 @@ def test_models_distributed_Qwen3_MOE_W8A8():
7272
enforce_eager=False,
7373
) as vllm_model:
7474
vllm_model.generate_greedy(example_prompts, max_tokens)
75+
76+
77+
def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH_AIV():
    """Run Qwen3-30B-A3B (TP=2, ACL graph enabled) with
    HCCL_OP_EXPANSION_MODE='AIV'.

    Exercises the AIV branch of the ACL-graph max-batch-size calculation
    (see ``update_aclgraph_sizes`` in ``vllm_ascend/utils.py``).
    ``enforce_eager=False`` keeps graph capture on.
    """
    # Save any pre-existing value so this test does not leak environment
    # state into tests that run after it (the non-AIV companion test
    # depends on this variable being unset).
    prior = os.environ.get('HCCL_OP_EXPANSION_MODE')
    os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
    try:
        example_prompts = [
            "Hello, my name is",
        ]
        dtype = "auto"
        max_tokens = 5
        with VllmRunner(
                "Qwen/Qwen3-30B-A3B",
                dtype=dtype,
                tensor_parallel_size=2,
                enforce_eager=False,
        ) as vllm_model:
            vllm_model.generate_greedy(example_prompts, max_tokens)
    finally:
        # Restore the environment exactly as we found it.
        if prior is None:
            os.environ.pop('HCCL_OP_EXPANSION_MODE', None)
        else:
            os.environ['HCCL_OP_EXPANSION_MODE'] = prior
91+
92+
93+
def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH():
    """Run Qwen3-30B-A3B (TP=2, ACL graph enabled) with
    HCCL_OP_EXPANSION_MODE unset.

    Exercises the default (non-AIV) branch of the ACL-graph
    max-batch-size calculation in ``vllm_ascend/utils.py``.
    """
    # pop(..., None) removes the variable if present and remembers the
    # prior value so it can be restored afterwards, instead of deleting
    # it unconditionally and leaking the change to later tests.
    prior = os.environ.pop('HCCL_OP_EXPANSION_MODE', None)
    try:
        example_prompts = [
            "Hello, my name is",
        ]
        dtype = "auto"
        max_tokens = 5
        with VllmRunner(
                "Qwen/Qwen3-30B-A3B",
                dtype=dtype,
                tensor_parallel_size=2,
                enforce_eager=False,
        ) as vllm_model:
            vllm_model.generate_greedy(example_prompts, max_tokens)
    finally:
        # Restore the environment exactly as we found it.
        if prior is not None:
            os.environ['HCCL_OP_EXPANSION_MODE'] = prior

vllm_ascend/utils.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# Adapted from vllm-project/vllm/vllm/worker/worker.py
1818
#
1919

20+
import os
2021
import atexit
2122
import functools
2223
import math
@@ -325,17 +326,28 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
325326
num_hidden_layers = get_max_hidden_layers(hf_config)
326327
parallel_config = vllm_config.parallel_config
327328

329+
if os.getenv("HCCL_OP_EXPANSION_MODE")=='AIV':
328330
# TODO: Find out whether we need to take into account the pp_size
329-
parallel_factor = 1 + sum(size > 1 for size in [
330-
parallel_config.data_parallel_size_local,
331-
parallel_config.tensor_parallel_size,
332-
])
333-
334-
# Calculate maximum supported batch sizes considering model architecture
335-
max_num_batch_sizes = math.floor(MAX_CAPTURE_SIZE /
336-
(num_hidden_layers + 1) / parallel_factor)
337-
logger.info("Calculated maximum supported batch sizes for ACL graph: %s",
338-
max_num_batch_sizes)
331+
parallel_factor = 1 + sum(size > 1 for size in [
332+
parallel_config.data_parallel_size,
333+
parallel_config.tensor_parallel_size,
334+
])
335+
336+
# Calculate maximum supported batch sizes considering model architecture
337+
max_num_batch_sizes = math.floor(MAX_CAPTURE_SIZE /
338+
(num_hidden_layers + 1) / parallel_factor)
339+
logger.info("Calculated maximum supported batch sizes for ACL graph: %s",
340+
max_num_batch_sizes)
341+
else:
342+
num_comm_groups = sum(size > 1 for size in [
343+
parallel_config.data_parallel_size,
344+
parallel_config.tensor_parallel_size,
345+
])
346+
347+
max_num_batch_sizes = math.floor((MAX_CAPTURE_SIZE - num_comm_groups * 40) / (num_hidden_layers + 1) / (1 + num_comm_groups * 2))
348+
logger.info("Calculated maximum supported batch sizes for ACL graph: %s",
349+
max_num_batch_sizes)
350+
logger.warning("Unset HCCL_OP_EXPANSION_MODE prevents max size capture. Setting HCCL_OP_EXPANSION_MODE=AIV captures max sizes and boosts ACL graph performance.")
339351

340352
# If original sizes exceed maximum, sample a representative subset
341353
if max_num_batch_sizes < len(original_sizes):

0 commit comments

Comments
 (0)