
Commit dbc6dc8

fixing integration tests
Signed-off-by: Patryk Saffer <[email protected]>
1 parent 2f2a04b commit dbc6dc8

File tree

2 files changed: +35 -48 lines changed

  tests/v1/e2e/test_eplb_offline.py
  vllm/v1/worker/gpu_model_runner.py
tests/v1/e2e/test_eplb_offline.py

Lines changed: 34 additions & 47 deletions
@@ -5,63 +5,50 @@
 import pytest
 import torch
 
-from vllm import LLM, SamplingParams
+from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.arg_utils import EPLBConfig
 
 
-@pytest.fixture
-def sampling_config():
-    return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)
-
-
 @pytest.mark.parametrize(
     "model_setup",
     [
         ("Qwen/Qwen3-Next-80B-A3B-Instruct", 4),
     ],
-    ids=["llama4"],
 )
 def test_eplb_model(
-    monkeypatch: pytest.MonkeyPatch,
-    sampling_config: SamplingParams,
     model_setup: tuple[str, int],
 ):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1")
-
-        model_name, tp_size = model_setup
-        test_prompts = ["This is a prompt which has more than 10 tokens."]
-
-        llm_args = dict(
-            model=model_name,
-            tensor_parallel_size=tp_size,
-            max_model_len=2048,
-            enable_expert_parallel=True,
-            num_redundant_experts=tp_size,
-            eplb_window_size=8,
-            eplb_step_interval=10,
-            eplb_log_balancedness=True,
-            enable_eplb=True,
-            load_format="dummy",
-            gpu_memory_utilization=0.95,
-        )
-
-        # Save EPLB statistics to disk
-        eplb_config_save = EPLBConfig(save_load_window=True, save_dir="/tmp")
-        llm = LLM(eplb_config=eplb_config_save, **llm_args)
-        llm.generate(test_prompts, sampling_config)
-        del llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
-
-        # Load EPLB statistics from disk
-        eplb_config_load = EPLBConfig(
-            load_initial_load_window=True,
-            load_path="/tmp/global_expert_load_window_i0.safetensors",
-        )
-        llm = LLM(eplb_config=eplb_config_load, **llm_args)
-        llm.generate(test_prompts, sampling_config)
-        del llm
-        torch.cuda.empty_cache()
-        cleanup_dist_env_and_memory()
+    model_name, tp_size = model_setup
+    test_prompt = ["This is a prompt which has more than 10 tokens."]
+
+    llm_args = dict(
+        model=model_name,
+        tensor_parallel_size=tp_size,
+        max_model_len=2048,
+        enable_expert_parallel=True,
+        enable_eplb=True,
+        load_format="dummy",
+        gpu_memory_utilization=0.95,
+    )
+
+    # Save EPLB statistics to disk
+    eplb_config_save = EPLBConfig(window_size=8,
+        step_interval=10, save_load_window=True, save_dir="/tmp")
+    llm = LLM(eplb_config=eplb_config_save, **llm_args)
+    llm.generate(test_prompt)
+    del llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
+
+    # Load EPLB statistics from disk
+    eplb_config_load = EPLBConfig(
+        load_initial_load_window=True,
+        load_path="/tmp/global_expert_load_window_i0.safetensors",
+        use_async=True,
+    )
+    llm = LLM(eplb_config=eplb_config_load, **llm_args)
+    llm.generate(test_prompt)
+    del llm
+    torch.cuda.empty_cache()
+    cleanup_dist_env_and_memory()
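
Note: with the sampling_config fixture removed, llm.generate(test_prompt) now runs with vLLM's default SamplingParams. If short, deterministic completions are still wanted, a minimal sketch reusing the parameters from the removed fixture would be:

    from vllm import SamplingParams

    sampling_params = SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)
    llm.generate(test_prompt, sampling_params)

Whether the EPLB save/load round trip needs bounded outputs is a judgment call; this is only a sketch of the option.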

vllm/v1/worker/gpu_model_runner.py

Lines changed: 1 addition & 1 deletion
@@ -3610,7 +3610,7 @@ def load_model(self, eep_scale_up: bool = False) -> None:
             old_global_expert_indices,
             rank_mapping,
         )
-        if self.parallel_config.eplb_config.load_initial_load_window is not None:
+        if self.parallel_config.eplb_config.load_initial_load_window:
             self.eplb_state.rearrange(load_initial_load_window=True)
         if self.parallel_config.eplb_config.static:
             self.eplb_state = None
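
This hunk matters because load_initial_load_window appears to be a boolean flag rather than an optional value: a check with "is not None" is true even when the flag is left at its default, so the rearrange path would always run. A minimal sketch of the pitfall, using a hypothetical stand-in for EPLBConfig (field name taken from the diff, default of False assumed):

    from dataclasses import dataclass

    @dataclass
    class EPLBConfigSketch:
        # hypothetical stand-in; assumes the flag defaults to False
        load_initial_load_window: bool = False

    cfg = EPLBConfigSketch()
    print(cfg.load_initial_load_window is not None)  # True  -> old check always fires
    print(bool(cfg.load_initial_load_window))        # False -> new check respects the default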
