5 | 5 | import pytest |
6 | 6 | import torch |
7 | 7 |
8 | | -from vllm import LLM, SamplingParams |
| 8 | +from vllm import LLM |
9 | 9 | from vllm.distributed import cleanup_dist_env_and_memory |
10 | 10 | from vllm.engine.arg_utils import EPLBConfig |
11 | 11 |
12 | 12 |
13 | | -@pytest.fixture |
14 | | -def sampling_config(): |
15 | | - return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False) |
16 | | - |
17 | | - |
18 | 13 | @pytest.mark.parametrize( |
19 | 14 | "model_setup", |
20 | 15 | [ |
21 | 16 | ("Qwen/Qwen3-Next-80B-A3B-Instruct", 4), |
22 | 17 | ], |
23 | | - ids=["llama4"], |
24 | 18 | ) |
25 | 19 | def test_eplb_model( |
26 | | - monkeypatch: pytest.MonkeyPatch, |
27 | | - sampling_config: SamplingParams, |
28 | 20 | model_setup: tuple[str, int], |
29 | 21 | ): |
30 | | - with monkeypatch.context() as m: |
31 | | - m.setenv("VLLM_USE_V1", "1") |
32 | | - |
33 | | - model_name, tp_size = model_setup |
34 | | - test_prompts = ["This is a prompt which has more than 10 tokens."] |
35 | | - |
36 | | - llm_args = dict( |
37 | | - model=model_name, |
38 | | - tensor_parallel_size=tp_size, |
39 | | - max_model_len=2048, |
40 | | - enable_expert_parallel=True, |
41 | | - num_redundant_experts=tp_size, |
42 | | - eplb_window_size=8, |
43 | | - eplb_step_interval=10, |
44 | | - eplb_log_balancedness=True, |
45 | | - enable_eplb=True, |
46 | | - load_format="dummy", |
47 | | - gpu_memory_utilization=0.95, |
48 | | - ) |
49 | | - |
50 | | - # Save EPLB statistics to disk |
51 | | - eplb_config_save = EPLBConfig(save_load_window=True, save_dir="/tmp") |
52 | | - llm = LLM(eplb_config=eplb_config_save, **llm_args) |
53 | | - llm.generate(test_prompts, sampling_config) |
54 | | - del llm |
55 | | - torch.cuda.empty_cache() |
56 | | - cleanup_dist_env_and_memory() |
57 | | - |
58 | | - # Load EPLB statistics from disk |
59 | | - eplb_config_load = EPLBConfig( |
60 | | - load_initial_load_window=True, |
61 | | - load_path="/tmp/global_expert_load_window_i0.safetensors", |
62 | | - ) |
63 | | - llm = LLM(eplb_config=eplb_config_load, **llm_args) |
64 | | - llm.generate(test_prompts, sampling_config) |
65 | | - del llm |
66 | | - torch.cuda.empty_cache() |
67 | | - cleanup_dist_env_and_memory() |
| 22 | + model_name, tp_size = model_setup |
| 23 | + test_prompt = ["This is a prompt which has more than 10 tokens."] |
| 24 | + |
| 25 | + llm_args = dict( |
| 26 | + model=model_name, |
| 27 | + tensor_parallel_size=tp_size, |
| 28 | + max_model_len=2048, |
| 29 | + enable_expert_parallel=True, |
| 30 | + enable_eplb=True, |
| 31 | + load_format="dummy", |
| 32 | + gpu_memory_utilization=0.95, |
| 33 | + ) |
| 34 | + |
| 35 | + # Save EPLB statistics to disk |
| 36 | +    eplb_config_save = EPLBConfig(window_size=8, step_interval=10, |
| 37 | +                                   save_load_window=True, save_dir="/tmp") |
| 38 | + llm = LLM(eplb_config=eplb_config_save, **llm_args) |
| 39 | + llm.generate(test_prompt) |
| 40 | + del llm |
| 41 | + torch.cuda.empty_cache() |
| 42 | + cleanup_dist_env_and_memory() |
| 43 | + |
| 44 | + # Load EPLB statistics from disk |
| 45 | + eplb_config_load = EPLBConfig( |
| 46 | + load_initial_load_window=True, |
| 47 | + load_path="/tmp/global_expert_load_window_i0.safetensors", |
| 48 | + use_async=True, |
| 49 | + ) |
| 50 | + llm = LLM(eplb_config=eplb_config_load, **llm_args) |
| 51 | + llm.generate(test_prompt) |
| 52 | + del llm |
| 53 | + torch.cuda.empty_cache() |
| 54 | + cleanup_dist_env_and_memory() |
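
For reviewers who want to try the new save/load path outside of pytest, here is a minimal two-phase sketch of what the updated test exercises. It only reuses names that appear in the diff above (`window_size`, `step_interval`, `save_load_window`, `save_dir` for the save phase; `load_initial_load_window` and `load_path` for the load phase); the exact `EPLBConfig` fields may differ across vLLM versions, so treat this as an illustration of the flow rather than a reference for the API.

```python
import torch

from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import EPLBConfig

# Shared engine arguments, mirroring the test (dummy weights keep it cheap).
common_args = dict(
    model="Qwen/Qwen3-Next-80B-A3B-Instruct",
    tensor_parallel_size=4,
    max_model_len=2048,
    enable_expert_parallel=True,
    enable_eplb=True,
    load_format="dummy",
    gpu_memory_utilization=0.95,
)

# Phase 1: collect expert-load statistics and persist the window under /tmp.
save_cfg = EPLBConfig(window_size=8, step_interval=10,
                      save_load_window=True, save_dir="/tmp")
llm = LLM(eplb_config=save_cfg, **common_args)
llm.generate(["This is a prompt which has more than 10 tokens."])
del llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()

# Phase 2: start a fresh engine seeded from the saved load window instead of
# waiting for new statistics to accumulate.
load_cfg = EPLBConfig(
    load_initial_load_window=True,
    load_path="/tmp/global_expert_load_window_i0.safetensors",
)
llm = LLM(eplb_config=load_cfg, **common_args)
llm.generate(["This is a prompt which has more than 10 tokens."])
del llm
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
```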