Skip to content

Commit ecec01d

Browse files
committed
more fixes
Signed-off-by: Patryk Saffer <[email protected]>
1 parent 783ca91 commit ecec01d

File tree

26 files changed

+45
-53
lines changed

26 files changed

+45
-53
lines changed

vllm/config/parallel.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,14 @@ class EPLBConfig:
6363
This is turned off by default since it will cause communication overhead.
6464
"""
6565

66+
load_initial_load_window: bool = False
67+
save_load_window: bool = False
68+
static: bool = False
6669
save_dir: Path | None = None
6770
"""Directory to save expert load balance metrics."""
6871
load_path: Path | None = None
6972
"""Path to load expert load balance metrics."""
7073

71-
@property
72-
def record_metrics(self) -> bool:
73-
return self.save_dir is not None or (
74-
self.save_dir is None and self.load_path is None
75-
)
76-
7774

7875
@config
7976
@dataclass

vllm/distributed/eplb/eplb_state.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -352,10 +352,8 @@ def add_model(
352352
device=self.device,
353353
)
354354

355-
eplb_load_path = self.parallel_config.eplb_config.load_path
356-
eplb_save_dir = self.parallel_config.eplb_config.save_dir
357355
eplb_step_interval = self.parallel_config.eplb_config.step_interval
358-
if eplb_load_path is not None or eplb_save_dir is not None:
356+
if self.parallel_config.eplb_config.load_initial_load_window or self.parallel_config.eplb_config.save_load_window:
359357
self.expert_rearrangement_step = 0
360358
else:
361359
# Set the initial progress of rearrangement to 3/4
@@ -579,7 +577,7 @@ def rearrange(
579577
# Map the physical expert load to global logical experts
580578
global_expert_load_windows = []
581579
should_save_eplb_state = (
582-
self.parallel_config.eplb_config.save_dir is not None
580+
self.parallel_config.eplb_config.save_load_window
583581
and not is_profile
584582
and self.expert_rearrangement_step > 0
585583
)

vllm/engine/arg_utils.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -415,8 +415,6 @@ class EngineArgs:
415415
eplb_window_size: int = EPLBConfig.window_size
416416
eplb_step_interval: int = EPLBConfig.step_interval
417417
eplb_log_balancedness: bool = EPLBConfig.log_balancedness
418-
eplb_save_dir: Path | None = EPLBConfig.save_dir
419-
eplb_load_path: Path | None = EPLBConfig.load_path
420418
max_parallel_loading_workers: int | None = (
421419
ParallelConfig.max_parallel_loading_workers
422420
)

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1246,7 +1246,7 @@ def eplb_map_to_physical_and_record(
12461246
expert_load_view: torch.Tensor,
12471247
logical_to_physical_map: torch.Tensor,
12481248
logical_replica_count: torch.Tensor,
1249-
eplb_record_metrics: bool = False,
1249+
eplb_static: bool = False,
12501250
indices_type: torch.dtype | None = None,
12511251
) -> torch.Tensor:
12521252
"""
@@ -1288,7 +1288,7 @@ def eplb_map_to_physical_and_record(
12881288

12891289
topk_ids = physical_ids
12901290

1291-
if eplb_record_metrics:
1291+
if eplb_static:
12921292
# 2. Record expert load metrics.
12931293

12941294
# TODO(bowen): When using `FusedMoEModularKernel`, this

vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def apply(
100100
apply_router_weight_on_input: bool = False,
101101
activation: str = "silu",
102102
enable_eplb: bool = False,
103-
eplb_record_metrics: bool = False,
103+
eplb_static: bool = False,
104104
expert_load_view: torch.Tensor | None = None,
105105
logical_to_physical_map: torch.Tensor | None = None,
106106
logical_replica_count: torch.Tensor | None = None,
@@ -134,7 +134,7 @@ def apply(
134134
e_score_correction_bias=e_score_correction_bias,
135135
indices_type=self.topk_indices_dtype,
136136
enable_eplb=enable_eplb,
137-
eplb_record_metrics=eplb_record_metrics,
137+
eplb_static=eplb_static,
138138
expert_map=expert_map,
139139
expert_load_view=expert_load_view,
140140
logical_to_physical_map=logical_to_physical_map,

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def __init__(
316316
activation: str = "silu",
317317
is_act_and_mul: bool = True,
318318
enable_eplb: bool = False,
319-
eplb_record_metrics: bool = False,
319+
eplb_static: bool = False,
320320
num_redundant_experts: int = 0,
321321
has_bias: bool = False,
322322
is_sequence_parallel=False,
@@ -398,7 +398,7 @@ def __init__(
398398
self.layer_name = prefix
399399

400400
self.enable_eplb = enable_eplb
401-
self.eplb_record_metrics = eplb_record_metrics
401+
self.eplb_static = eplb_static
402402
self.expert_load_view: torch.Tensor | None = None
403403
self.logical_to_physical_map: torch.Tensor | None = None
404404
self.logical_replica_count: torch.Tensor | None = None
@@ -1320,7 +1320,7 @@ def select_experts(
13201320
e_score_correction_bias: torch.Tensor | None = None,
13211321
indices_type: torch.dtype | None = None,
13221322
enable_eplb: bool = False,
1323-
eplb_record_metrics: bool = False,
1323+
eplb_static: bool = False,
13241324
expert_map: torch.Tensor | None = None,
13251325
expert_load_view: torch.Tensor | None = None,
13261326
logical_to_physical_map: torch.Tensor | None = None,
@@ -1423,7 +1423,7 @@ def select_experts(
14231423
topk_ids=topk_ids,
14241424
expert_load_view=expert_load_view,
14251425
logical_to_physical_map=logical_to_physical_map,
1426-
eplb_record_metrics=eplb_record_metrics,
1426+
eplb_static=eplb_static,
14271427
logical_replica_count=logical_replica_count,
14281428
indices_type=indices_type,
14291429
)
@@ -1610,7 +1610,7 @@ def process_chunk(chunk_start, chunk_end, skip_result_store=False):
16101610
e_score_correction_bias=self.e_score_correction_bias,
16111611
activation=self.activation,
16121612
enable_eplb=self.enable_eplb,
1613-
eplb_record_metrics=self.eplb_record_metrics,
1613+
eplb_static=self.eplb_static,
16141614
expert_load_view=self.expert_load_view,
16151615
logical_to_physical_map=self.logical_to_physical_map,
16161616
logical_replica_count=self.logical_replica_count,
@@ -1779,7 +1779,7 @@ def forward_impl(
17791779
activation=self.activation,
17801780
apply_router_weight_on_input=self.apply_router_weight_on_input,
17811781
enable_eplb=self.enable_eplb,
1782-
eplb_record_metrics=self.eplb_record_metrics,
1782+
eplb_static=self.eplb_static,
17831783
expert_load_view=self.expert_load_view,
17841784
logical_to_physical_map=self.logical_to_physical_map,
17851785
logical_replica_count=self.logical_replica_count,

vllm/model_executor/layers/quantization/awq_marlin.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,7 @@ def apply(
614614
apply_router_weight_on_input: bool = False,
615615
activation: str = "silu",
616616
enable_eplb: bool = False,
617-
eplb_record_metrics: bool = False,
617+
eplb_static: bool = False,
618618
expert_load_view: torch.Tensor | None = None,
619619
logical_to_physical_map: torch.Tensor | None = None,
620620
logical_replica_count: torch.Tensor | None = None,

vllm/model_executor/layers/quantization/bitsandbytes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,7 @@ def apply(
512512
apply_router_weight_on_input: bool = False,
513513
activation: str = "silu",
514514
enable_eplb: bool = False,
515-
eplb_record_metrics: bool = False,
515+
eplb_static: bool = False,
516516
expert_load_view: torch.Tensor | None = None,
517517
logical_to_physical_map: torch.Tensor | None = None,
518518
logical_replica_count: torch.Tensor | None = None,

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -439,7 +439,7 @@ def apply(
439439
apply_router_weight_on_input: bool = False,
440440
activation: str = "silu",
441441
enable_eplb: bool = False,
442-
eplb_record_metrics: bool = False,
442+
eplb_static: bool = False,
443443
expert_load_view: torch.Tensor | None = None,
444444
logical_to_physical_map: torch.Tensor | None = None,
445445
logical_replica_count: torch.Tensor | None = None,
@@ -1019,7 +1019,7 @@ def apply(
10191019
apply_router_weight_on_input: bool = False,
10201020
activation: str = "silu",
10211021
enable_eplb: bool = False,
1022-
eplb_record_metrics: bool = False,
1022+
eplb_static: bool = False,
10231023
expert_load_view: torch.Tensor | None = None,
10241024
logical_to_physical_map: torch.Tensor | None = None,
10251025
logical_replica_count: torch.Tensor | None = None,
@@ -1288,7 +1288,7 @@ def apply(
12881288
apply_router_weight_on_input: bool = False,
12891289
activation: str = "silu",
12901290
enable_eplb: bool = False,
1291-
eplb_record_metrics: bool = False,
1291+
eplb_static: bool = False,
12921292
expert_load_view: torch.Tensor | None = None,
12931293
logical_to_physical_map: torch.Tensor | None = None,
12941294
logical_replica_count: torch.Tensor | None = None,
@@ -1650,7 +1650,7 @@ def apply(
16501650
apply_router_weight_on_input: bool = False,
16511651
activation: str = "silu",
16521652
enable_eplb: bool = False,
1653-
eplb_record_metrics: bool = False,
1653+
eplb_static: bool = False,
16541654
expert_load_view: torch.Tensor | None = None,
16551655
logical_to_physical_map: torch.Tensor | None = None,
16561656
logical_replica_count: torch.Tensor | None = None,
@@ -1914,7 +1914,7 @@ def apply(
19141914
apply_router_weight_on_input: bool = False,
19151915
activation: str = "silu",
19161916
enable_eplb: bool = False,
1917-
eplb_record_metrics: bool = False,
1917+
eplb_static: bool = False,
19181918
expert_load_view: torch.Tensor | None = None,
19191919
logical_to_physical_map: torch.Tensor | None = None,
19201920
logical_replica_count: torch.Tensor | None = None,
@@ -2238,7 +2238,7 @@ def apply(
22382238
apply_router_weight_on_input: bool = False,
22392239
activation: str = "silu",
22402240
enable_eplb: bool = False,
2241-
eplb_record_metrics: bool = False,
2241+
eplb_static: bool = False,
22422242
expert_load_view: torch.Tensor | None = None,
22432243
logical_to_physical_map: torch.Tensor | None = None,
22442244
logical_replica_count: torch.Tensor | None = None,

vllm/model_executor/layers/quantization/experts_int8.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ def apply(
154154
apply_router_weight_on_input: bool = False,
155155
activation: str = "silu",
156156
enable_eplb: bool = False,
157-
eplb_record_metrics: bool = False,
157+
eplb_static: bool = False,
158158
expert_load_view: torch.Tensor | None = None,
159159
logical_to_physical_map: torch.Tensor | None = None,
160160
logical_replica_count: torch.Tensor | None = None,

0 commit comments

Comments (0)