5 | 5 | import pytest |
6 | 6 | import torch |
7 | 7 |
8 | | -from vllm import LLM, SamplingParams |
| 8 | +from vllm import LLM |
9 | 9 | from vllm.distributed import cleanup_dist_env_and_memory |
10 | 10 | from vllm.engine.arg_utils import EPLBConfig |
11 | 11 |
12 | 12 |
13 | | -@pytest.fixture |
14 | | -def sampling_config(): |
15 | | - return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False) |
16 | | - |
17 | | - |
18 | 13 | @pytest.mark.parametrize( |
19 | 14 | "model_setup", |
20 | 15 | [ |
21 | 16 | ("Qwen/Qwen3-Next-80B-A3B-Instruct", 4), |
22 | 17 | ], |
23 | | - ids=["llama4"], |
24 | 18 | ) |
25 | 19 | def test_eplb_model( |
26 | | - monkeypatch: pytest.MonkeyPatch, |
27 | | - sampling_config: SamplingParams, |
28 | 20 | model_setup: tuple[str, int], |
29 | 21 | ): |
30 | | - with monkeypatch.context() as m: |
31 | | - m.setenv("VLLM_USE_V1", "1") |
32 | | - |
33 | | - model_name, tp_size = model_setup |
34 | | - test_prompts = ["This is a prompt which has more than 10 tokens."] |
35 | | - |
36 | | - llm_args = dict( |
37 | | - model=model_name, |
38 | | - tensor_parallel_size=tp_size, |
39 | | - max_model_len=2048, |
40 | | - enable_expert_parallel=True, |
41 | | - num_redundant_experts=tp_size, |
42 | | - eplb_window_size=8, |
43 | | - eplb_step_interval=10, |
44 | | - eplb_log_balancedness=True, |
45 | | - enable_eplb=True, |
46 | | - load_format="dummy", |
47 | | - gpu_memory_utilization=0.95, |
48 | | - ) |
49 | | - |
50 | | - # Save EPLB statistics to disk |
51 | | - eplb_config_save = EPLBConfig(save_load_window=True, save_dir="/tmp") |
52 | | - llm = LLM(eplb_config=eplb_config_save, **llm_args) |
53 | | - llm.generate(test_prompts, sampling_config) |
54 | | - del llm |
55 | | - torch.cuda.empty_cache() |
56 | | - cleanup_dist_env_and_memory() |
57 | | - |
58 | | - # Load EPLB statistics from disk |
59 | | - eplb_config_load = EPLBConfig( |
60 | | - load_initial_load_window=True, |
61 | | - load_path="/tmp/global_expert_load_window_i0.safetensors", |
62 | | - ) |
63 | | - llm = LLM(eplb_config=eplb_config_load, **llm_args) |
64 | | - llm.generate(test_prompts, sampling_config) |
65 | | - del llm |
66 | | - torch.cuda.empty_cache() |
67 | | - cleanup_dist_env_and_memory() |
| 22 | + model_name, tp_size = model_setup |
| 23 | + test_prompt = ["This is a prompt which has more than 10 tokens."] |
| 24 | + |
| 25 | + llm_args = dict( |
| 26 | + model=model_name, |
| 27 | + tensor_parallel_size=tp_size, |
| 28 | + max_model_len=2048, |
| 29 | + enable_expert_parallel=True, |
| 30 | + enable_eplb=True, |
| 31 | + load_format="dummy", |
| 32 | + gpu_memory_utilization=0.95, |
| 33 | + ) |
| 34 | + |
| 35 | + # Save EPLB statistics to disk |
| 36 | +    eplb_config_save = EPLBConfig(window_size=8, step_interval=10, |
| 37 | +                                   save_load_window=True, save_dir="/tmp") |
| 38 | + llm = LLM(eplb_config=eplb_config_save, **llm_args) |
| 39 | + llm.generate(test_prompt) |
| 40 | + del llm |
| 41 | + torch.cuda.empty_cache() |
| 42 | + cleanup_dist_env_and_memory() |
| 43 | + |
| 44 | + # Load EPLB statistics from disk |
| 45 | + eplb_config_load = EPLBConfig( |
| 46 | + load_initial_load_window=True, |
| 47 | + load_path="/tmp/global_expert_load_window_i0.safetensors", |
| 48 | + use_async=True, |
| 49 | + ) |
| 50 | + llm = LLM(eplb_config=eplb_config_load, **llm_args) |
| 51 | + llm.generate(test_prompt) |
| 52 | + del llm |
| 53 | + torch.cuda.empty_cache() |
| 54 | + cleanup_dist_env_and_memory() |
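
For reviewers who want to try the new save/load path outside of pytest, here is a minimal two-phase sketch of what the updated test exercises. It only reuses names that appear in the diff above (`window_size`, `step_interval`, `save_load_window`, `save_dir` for the save phase; `load_initial_load_window` and `load_path` for the load phase); the exact `EPLBConfig` fields may differ across vLLM versions, so treat this as an illustration of the flow rather than a reference for the API.

```python
import torch

from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import EPLBConfig

# Shared engine arguments, mirroring the test (dummy weights keep it cheap).
common_args = dict(
    model="Qwen/Qwen3-Next-80B-A3B-Instruct",
    tensor_parallel_size=4,
    max_model_len=2048,
    enable_expert_parallel=True,
    enable_eplb=True,
    load_format="dummy",
    gpu_memory_utilization=0.95,
)

# Phase 1: collect expert-load statistics and persist the window under /tmp.
save_cfg = EPLBConfig(window_size=8, step_interval=10,
                      save_load_window=True, save_dir="/tmp")
llm = LLM(eplb_config=save_cfg, **common_args)
llm.generate(["This is a prompt which has more than 10 tokens."])
del llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

# Phase 2: start a fresh engine seeded from the saved load window instead of
# waiting for new statistics to accumulate.
load_cfg = EPLBConfig(
    load_initial_load_window=True,
    load_path="/tmp/global_expert_load_window_i0.safetensors",
)
llm = LLM(eplb_config=load_cfg, **common_args)
llm.generate(["This is a prompt which has more than 10 tokens."])
del llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
```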