File tree Expand file tree Collapse file tree 2 files changed +24
-0
lines changed Expand file tree Collapse file tree 2 files changed +24
-0
lines changed Original file line number Diff line number Diff line change 10
10
from torch ._inductor .pattern_matcher import PatternMatcherPass
11
11
from torch .distributed ._symmetric_memory import enable_symm_mem_for_group
12
12
13
+ import vllm .envs as envs
13
14
from vllm .config import VllmConfig
14
15
from vllm .distributed import get_tp_group , tensor_model_parallel_all_reduce
15
16
from vllm .distributed .parallel_state import (
@@ -401,6 +402,18 @@ def __call__(self, graph: fx.Graph):
401
402
6 : MiB // 2 , # 512KB
402
403
8 : MiB // 2 , # 512KB
403
404
}
405
+
406
+ try :
407
+ _FI_MAX_SIZES .update ({
408
+ int (k ): int (float (v ) * MiB )
409
+ for k , v in
410
+ envs .VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB .items ()
411
+ })
412
+ except Exception as e :
413
+ raise ValueError (
414
+ "Failed to parse VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB: "
415
+ + str (e )) from e
416
+
404
417
# opt for a more conservative default value
405
418
# when world size is not in _FI_MAX_SIZES
406
419
_DEFAULT_FI_MAX_SIZE = MiB // 2
Original file line number Diff line number Diff line change 2
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
3
4
4
import hashlib
5
+ import json
5
6
import os
6
7
import sys
7
8
import tempfile
@@ -1046,6 +1047,16 @@ def get_vllm_port() -> Optional[int]:
1046
1047
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE" :
1047
1048
lambda : int (os .getenv ("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE" , "163840" )),
1048
1049
1050
+ # Specifies the thresholds of the communicated tensor sizes under which
1051
+ # vLLM should use FlashInfer fused allreduce. The variable should be a
1052
+ # JSON with the following format:
1053
+ # { "<world size>": <max size in MB> }
1054
+ # Unspecified world sizes will fall back to
1055
+ # { 2: 64, 4: 1, <everything else>: 0.5 }
1056
+ "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB" :
1057
+ lambda : json .loads (os .getenv (
1058
+ "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB" , "{}" )),
1059
+
1049
1060
# MoE routing strategy selector.
1050
1061
# See `RoutingSimulator.get_available_strategies()` for available
1051
1062
# strategies.
You can’t perform that action at this time.
0 commit comments