
Commit 4414d64

fix_A3_ACLgraph_sizes_capture_bug_and_add_new_ut
Signed-off-by: lilinsiman <[email protected]>
1 parent 992271b · commit 4414d64

File tree: 3 files changed, +78 -10 lines changed


tests/e2e/multicard/test_qwen3_moe.py

Lines changed: 36 additions & 0 deletions
@@ -21,6 +21,9 @@
 Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
 """
 
+
+import os
+
 from modelscope import snapshot_download  # type: ignore
 
 from tests.e2e.conftest import VllmRunner
@@ -72,3 +75,36 @@ def test_models_distributed_Qwen3_MOE_W8A8():
             enforce_eager=False,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH_AIV():
+    os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV'
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    dtype = "auto"
+    max_tokens = 5
+    with VllmRunner(
+            "Qwen/Qwen3-30B-A3B",
+            dtype=dtype,
+            tensor_parallel_size=2,
+            enforce_eager=False,
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
+def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH():
+    if 'HCCL_OP_EXPANSION_MODE' in os.environ:
+        del os.environ['HCCL_OP_EXPANSION_MODE']
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    dtype = "auto"
+    max_tokens = 5
+    with VllmRunner(
+            "Qwen/Qwen3-30B-A3B",
+            dtype=dtype,
+            tensor_parallel_size=2,
+            enforce_eager=False,
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
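Both new tests toggle the variable by writing to os.environ directly, so a value set in one test stays visible to anything that runs later in the same pytest process. Below is a minimal sketch of the same toggle using pytest's built-in monkeypatch fixture, which reverts the change when each test finishes; the test names and bodies are placeholders, not part of this commit.

import pytest


def test_aclgraph_with_aiv_expansion(monkeypatch: pytest.MonkeyPatch) -> None:
    # Same effect as os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV', but the
    # change is automatically undone after the test.
    monkeypatch.setenv("HCCL_OP_EXPANSION_MODE", "AIV")
    ...  # run the VllmRunner workload as in the tests above


def test_aclgraph_with_default_expansion(monkeypatch: pytest.MonkeyPatch) -> None:
    # Same effect as the `del os.environ[...]` branch above; raising=False
    # makes the deletion a no-op when the variable is not set.
    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
    ...  # run the VllmRunner workload as in the tests above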

vllm_ascend/envs.py

Lines changed: 3 additions & 0 deletions
@@ -55,6 +55,9 @@
     # Please make sure that the version is correct.
     "SOC_VERSION":
     lambda: os.getenv("SOC_VERSION", "ASCEND910B1"),
+    # location for orchestrated deployment of communication algorithms.
+    "HCCL_OP_EXPANSION_MODE":
+    lambda: os.environ.get("HCCL_OP_EXPANSION_MODE", None),
     # If set, vllm-ascend will print verbose logs during compilation
     "VERBOSE":
     lambda: bool(int(os.getenv('VERBOSE', '0'))),
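The new entry follows the existing pattern in vllm_ascend/envs.py: the value is a zero-argument lambda, so envs.HCCL_OP_EXPANSION_MODE is re-read from the process environment on every access instead of being frozen at import time. A self-contained sketch of that lazy-lookup pattern is shown below; the module-level __getattr__ is how vLLM-style env modules typically expose the map and is included as an illustration, not as the exact file contents.

import os
from typing import Any, Callable

# Map from variable name to a zero-argument callable; each access reads the
# current process environment rather than caching a value at import time.
env_variables: dict[str, Callable[[], Any]] = {
    "HCCL_OP_EXPANSION_MODE":
    lambda: os.environ.get("HCCL_OP_EXPANSION_MODE", None),
}


def __getattr__(name: str) -> Any:
    # PEP 562 module __getattr__: `envs.HCCL_OP_EXPANSION_MODE` evaluates the
    # corresponding lambda on every attribute access.
    if name in env_variables:
        return env_variables[name]()
    raise AttributeError(f"module has no attribute {name!r}")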

vllm_ascend/utils.py

Lines changed: 39 additions & 10 deletions
@@ -325,18 +325,47 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     num_hidden_layers = get_max_hidden_layers(hf_config)
     parallel_config = vllm_config.parallel_config
 
-    # TODO: Find out whether we need to take into account the pp_size
-    parallel_factor = 1 + sum(size > 1 for size in [
-        parallel_config.data_parallel_size_local,
+    num_comm_groups = sum(size > 1 for size in [
+        parallel_config.data_parallel_size,
         parallel_config.tensor_parallel_size,
     ])
-
-    # Calculate maximum supported batch sizes considering model architecture
-    max_num_batch_sizes = math.floor(MAX_CAPTURE_SIZE /
-                                     (num_hidden_layers + 1) / parallel_factor)
-    logger.info("Calculated maximum supported batch sizes for ACL graph: %s",
-                max_num_batch_sizes)
-
+    if envs.HCCL_OP_EXPANSION_MODE == 'AIV':
+        # TODO: Find out whether we need to take into account the pp_size
+        parallel_factor = 1 + num_comm_groups + int(
+            parallel_config.enable_expert_parallel)
+        # Calculate maximum supported batch sizes considering model architecture on the A2 Hardware Device
+        # Assume the following case:
+        # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4,
+        # According to the formula, max_num_batch_sizes = math.floor(1920 / (48 + 1) / 2) = 19
+        max_num_batch_sizes = math.floor(
+            MAX_CAPTURE_SIZE / (num_hidden_layers + 1) / parallel_factor)
+        logger.info(
+            "Calculated maximum supported batch sizes for ACL graph: %s",
+            max_num_batch_sizes)
+    else:
+        # The above describes an empirical formula applicable to the A2 hardware.
+        # Under this configuration, HCCL employs the FFTS+ method for execution unfolding,
+        # which adds only 1 concurrent stream without consuming collective communication execution unfolding streams.
+        # On A3 hardware, HCCL defaults to the AICPU method.
+        # This approach may additionally allocate up to rank_size (max 16) - 1 streams per collective communication domain on the device (worst case).
+        # Using the default collective communication unfolding method on A3 will lead to a significant reduction in the maximum supported sizes.
+        # Therefore, the calculation formula has been modified as follows:
+        # Assume the following case:
+        # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4,
+        # According to the formula, max_num_batch_sizes = math.floor((1920 - 1 * 40) / (48 + 1) / (1 + 1 * 2)) = 12
+        max_num_batch_sizes = math.floor(
+            (MAX_CAPTURE_SIZE - num_comm_groups * 40) /
+            (num_hidden_layers + 1) / (1 + num_comm_groups * 2))
+        logger.info(
+            "Calculated maximum supported batch sizes for ACL graph: %s",
+            max_num_batch_sizes)
+        logger.warning(
+            "Currently, communication is performed using FFTS+ method, which reduces "
+            "the number of available streams and, as a result, limits the range of runtime "
+            "shapes that can be handled. To both improve communication performance and "
+            "increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV."
+        )
+
     # If original sizes exceed maximum, sample a representative subset
     if max_num_batch_sizes < len(original_sizes):
         # Sample uniformly from original sizes
