cleanups + lint, layer.py wip

bnellnm · bnellnm · commit 9590b96831ca · 2025-05-07T15:37:14.000Z
Signed-off-by: Bill Nell &lt;bnell@redhat.com&gt;
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
@@ -504,8 +504,7 @@ def torch_pplx_moe(pgi, dp_size, a, w1, w2, scores, topk):
         chunk_by_rank(w2, rank, world_size),
         chunk_topk_weight,
         chunk_topk_ids,
-        global_num_experts=num_experts
-    )
+        global_num_experts=num_experts)
 
     torch.cuda.synchronize()
 
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -622,8 +622,7 @@ def __init__(
         assert quant_method is not None
         self.quant_method = quant_method
 
-        dispatch_combine = self._construct_dispatch_combine(
-            moe, quant_config)
+        dispatch_combine = self._construct_dispatch_combine(moe, quant_config)
 
         success = self.quant_method.set_dispatch_combine(dispatch_combine)
 
@@ -1030,13 +1029,12 @@ def forward(self, hidden_states: torch.Tensor,
             return torch.ops.vllm.moe_forward(hidden_states, router_logits,
                                               self.layer_name)
 
-
     def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                              full_router_logits: torch.Tensor):
 
         full_final_hidden_states = torch.empty_like(full_hidden_states)
 
-        def process_chunk(chunk_start, chunk_end, skip_result_store = False):
+        def process_chunk(chunk_start, chunk_end, skip_result_store=False):
             hidden_states = full_hidden_states[chunk_start:chunk_end, :]
             router_logits = full_router_logits[chunk_start:chunk_end, :]
 
@@ -1089,18 +1087,23 @@ def process_chunk(chunk_start, chunk_end, skip_result_store = False):
                 full_final_hidden_states[chunk_start:chunk_end, :].copy_(
                     final_hidden_states)
 
-        max_tokens_across_dp = get_forward_context().dp_metadata.max_tokens_across_dp
+        max_tokens_across_dp = get_forward_context(
+        ).dp_metadata.max_tokens_across_dp
         moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE // self.dp_size
 
         num_tokens = full_hidden_states.size(0)
-        for chunk_start_ in range(0, max_tokens_across_dp, moe_dp_chunk_size_per_rank):
-            chunk_start = chunk_start_ 
-            chunk_end =  min(chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dp)
+        for chunk_start_ in range(0, max_tokens_across_dp,
+                                  moe_dp_chunk_size_per_rank):
+            chunk_start = chunk_start_
+            chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
+                            max_tokens_across_dp)
             # clamp start and end
             chunk_start = min(chunk_start, num_tokens - 1)
             chunk_end = min(chunk_end, num_tokens)
 
-            process_chunk(chunk_start, chunk_end, skip_result_store = chunk_start_ >= num_tokens)
+            process_chunk(chunk_start,
+                          chunk_end,
+                          skip_result_store=chunk_start_ >= num_tokens)
 
         return full_final_hidden_states