@@ -325,18 +325,47 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
     num_hidden_layers = get_max_hidden_layers(hf_config)
     parallel_config = vllm_config.parallel_config
 
-    # TODO: Find out whether we need to take into account the pp_size
-    parallel_factor = 1 + sum(size > 1 for size in [
-        parallel_config.data_parallel_size_local,
+    num_comm_groups = sum(size > 1 for size in [
+        parallel_config.data_parallel_size,
         parallel_config.tensor_parallel_size,
     ])
-
-    # Calculate maximum supported batch sizes considering model architecture
-    max_num_batch_sizes = math.floor(MAX_CAPTURE_SIZE /
-                                     (num_hidden_layers + 1) / parallel_factor)
-    logger.info("Calculated maximum supported batch sizes for ACL graph: %s",
-                max_num_batch_sizes)
-
+    if envs.HCCL_OP_EXPANSION_MODE == 'AIV':
+        # TODO: Find out whether we need to take into account the pp_size
+        parallel_factor = 1 + num_comm_groups + int(
+            parallel_config.enable_expert_parallel)
+        # Calculate the maximum supported batch sizes considering model architecture on A2 hardware.
+        # Assume the following case:
+        # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4.
+        # According to the formula, max_num_batch_sizes = math.floor(1920 / (48 + 1) / 2) = 19.
+        max_num_batch_sizes = math.floor(
+            MAX_CAPTURE_SIZE / (num_hidden_layers + 1) / parallel_factor)
+        logger.info(
+            "Calculated maximum supported batch sizes for ACL graph: %s",
+            max_num_batch_sizes)
+    else:
+        # The formula above is empirical and applies to A2 hardware, where HCCL
+        # uses the FFTS+ method for execution unfolding. FFTS+ adds only 1
+        # concurrent stream and consumes no collective communication execution
+        # unfolding streams.
+        # On A3 hardware, HCCL defaults to the AICPU method, which in the worst
+        # case may additionally allocate up to rank_size (max 16) - 1 streams per
+        # collective communication domain on the device. Keeping this default
+        # unfolding method on A3 significantly reduces the maximum supported
+        # sizes, so the formula is adjusted as follows.
+        # Assume the following case:
+        # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4.
+        # According to the formula, max_num_batch_sizes = math.floor((1920 - 1 * 40) / (48 + 1) / (1 + 1 * 2)) = 12.
+        max_num_batch_sizes = math.floor(
+            (MAX_CAPTURE_SIZE - num_comm_groups * 40) /
+            (num_hidden_layers + 1) / (1 + num_comm_groups * 2))
+        logger.info(
+            "Calculated maximum supported batch sizes for ACL graph: %s",
+            max_num_batch_sizes)
+        logger.warning(
+            "Currently, communication is performed using the default AICPU method, "
+            "which reduces the number of available streams and, as a result, limits "
+            "the range of runtime shapes that can be handled. To both improve "
+            "communication performance and increase the number of supported shapes, "
+            "set HCCL_OP_EXPANSION_MODE=AIV.")
+
 
     # If original sizes exceed maximum, sample a representative subset
     if max_num_batch_sizes < len(original_sizes):
         # Sample uniformly from original sizes
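
For reference, here is a minimal standalone sketch of the two sizing formulas this hunk introduces. The function name `estimate_max_num_batch_sizes`, the `aiv_mode` flag, and the hard-coded `MAX_CAPTURE_SIZE = 1920` are illustrative stand-ins for the module-level `MAX_CAPTURE_SIZE` constant and the `envs.HCCL_OP_EXPANSION_MODE` check; the arithmetic mirrors the diff and reproduces the worked examples from the comments.

```python
import math

MAX_CAPTURE_SIZE = 1920  # stand-in for the module-level constant


def estimate_max_num_batch_sizes(num_hidden_layers: int,
                                 num_comm_groups: int,
                                 enable_expert_parallel: bool,
                                 aiv_mode: bool) -> int:
    """Sketch of the ACL graph sizing formulas from the hunk above."""
    if aiv_mode:
        # FFTS+ unfolding: each comm group (and expert parallelism) adds
        # one concurrent stream on top of the base stream.
        parallel_factor = 1 + num_comm_groups + int(enable_expert_parallel)
        return math.floor(MAX_CAPTURE_SIZE / (num_hidden_layers + 1) /
                          parallel_factor)
    # AICPU unfolding: reserve 40 capture slots per comm group and assume
    # two extra streams per group in the worst case.
    return math.floor((MAX_CAPTURE_SIZE - num_comm_groups * 40) /
                      (num_hidden_layers + 1) / (1 + num_comm_groups * 2))


# Worked examples from the comments: DP=1, TP=4 -> one comm group, no EP.
assert estimate_max_num_batch_sizes(48, 1, False, aiv_mode=True) == 19
assert estimate_max_num_batch_sizes(48, 1, False, aiv_mode=False) == 12
```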