@@ -395,7 +395,7 @@ def invoke_moe_batched_triton_kernel(
     assert max_num_tokens % BLOCK_M == 0

     grid = (expert_num_tokens.size(0), triton.cdiv(max_num_tokens, BLOCK_M) *
-            triton.cdiv(B.shape[1], BLOCK_N))
+            triton.cdiv(B.size(1), BLOCK_N))

     batched_triton_kernel[grid](
         A,
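The recurring change in this diff swaps `Tensor.shape[i]` indexing for the `Tensor.size(i)` method. For a concrete tensor the two accessors are interchangeable, so the change is a consistency refactor rather than a behavioral one; a minimal check:

```python
import torch

t = torch.empty(4, 8, 16)

# size(i) and shape[i] return the same value for a concrete tensor;
# size() with no argument returns the full torch.Size.
assert t.size(1) == t.shape[1] == 8
assert t.size(-1) == t.shape[-1] == 16
assert t.size() == t.shape == torch.Size([4, 8, 16])
```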
@@ -493,17 +493,17 @@ def dispatch(
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         assert a1.dim() == 2
         assert topk_ids.dim() == 2
-        assert topk_ids.shape[0] == a1.shape[0]
+        assert topk_ids.size(0) == a1.size(0)

         if apply_router_weight_on_input:
-            topk = topk_ids.shape[1]
+            topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
             assert topk == 1, \
                 "apply_router_weight_on_input is only implemented for topk=1"
             a1.mul_(topk_weights.to(a1.dtype))

-        num_tokens, hidden_dim = a1.shape
-        topk = topk_ids.shape[1]
+        num_tokens, hidden_dim = a1.size()
+        topk = topk_ids.size(1)

         if self.max_num_tokens is None:
             tokens_per_expert = torch.bincount(topk_ids.view(-1),
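Two things happen in this hunk: `apply_router_weight_on_input` folds the (topk=1) routing weight into the activations before dispatch, and `torch.bincount` over the flattened `topk_ids` counts how many tokens each expert receives. A small sketch of both steps, with illustrative sizes that are not from the source:

```python
import torch

num_tokens, hidden_dim, num_experts, topk = 4, 8, 3, 1

a1 = torch.randn(num_tokens, hidden_dim)
topk_ids = torch.randint(num_experts, (num_tokens, topk))
topk_weights = torch.rand(num_tokens, topk)

# topk == 1: one routing weight per token, broadcast over hidden_dim.
a1.mul_(topk_weights.to(a1.dtype))

# Tokens per expert; minlength keeps experts that receive zero tokens.
tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)
assert tokens_per_expert.sum() == num_tokens * topk
```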
@@ -543,10 +543,10 @@ def combine(
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
     ) -> None:
-        num_tokens = topk_ids.shape[0]
-        num_local_experts = fused_expert_output.shape[0]
-        K = fused_expert_output.shape[-1]
-        assert output.shape[0] == num_tokens and output.shape[1] == K
+        num_tokens = topk_ids.size(0)
+        num_local_experts = fused_expert_output.size(0)
+        K = fused_expert_output.size(-1)
+        assert output.size(0) == num_tokens and output.size(1) == K

         output.fill_(0)

@@ -559,7 +559,7 @@ def combine(
             rows = torch.count_nonzero(topks)
             rhs = fused_expert_output[expert_id - first_expert, :rows, :]
             if not apply_router_weight_on_input:
-                rhs.mul_(topk_weights[topkws].view(rhs.shape[0], 1))
+                rhs.mul_(topk_weights[topkws].view(rhs.size(0), 1))
             output[topks] = output[topks] + rhs


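For context: `combine` scatters each expert's contiguous batch of output rows back to the token positions that were routed to it, applying `topk_weights` unless the weight was already folded into the input. A hedged sketch of that per-expert gather/weight/accumulate pattern, simplified to topk=1 and a zero `first_expert` offset (all names and sizes illustrative):

```python
import torch

num_tokens, K, num_experts = 4, 8, 2
topk_ids = torch.tensor([[0], [1], [0], [1]])          # topk = 1
topk_weights = torch.rand(num_tokens, 1)
# One contiguous batch of rows per expert, padded to num_tokens.
fused_expert_output = torch.randn(num_experts, num_tokens, K)

output = torch.zeros(num_tokens, K)
for expert_id in range(num_experts):
    topks = (topk_ids == expert_id).any(dim=1)         # mask of routed tokens
    rows = int(torch.count_nonzero(topks))
    rhs = fused_expert_output[expert_id, :rows, :].clone()
    rhs.mul_(topk_weights[topks].view(rhs.size(0), 1))  # apply routing weight
    output[topks] = output[topks] + rhs
```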
@@ -599,8 +599,8 @@ def workspace_shapes(
     ) -> Tuple[int, int, torch.dtype]:
         assert a.dim() == 2
         num_dp = self.world_size // self.dp_size
-        max_num_tokens = a.shape[
-            0] if self.max_num_tokens is None else self.max_num_tokens
+        max_num_tokens = a.size(
+            0) if self.max_num_tokens is None else self.max_num_tokens
         #print(f"WORKSPACE {max_num_tokens} {num_dp}")
         workspace13 = num_experts * max_num_tokens * num_dp * K
         workspace2 = max_num_tokens * num_dp * N
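The workspace sizes here are flat element counts: the batched layout reserves up to `max_num_tokens * num_dp` rows per expert, so `workspace13` must cover `num_experts * max_num_tokens * num_dp * K` elements. A worked example with made-up sizes:

```python
# Illustrative numbers only: 8 experts, 64 max tokens per DP rank,
# 2 DP ranks on this worker, hidden size K=4096, intermediate N=14336.
num_experts, max_num_tokens, num_dp, K, N = 8, 64, 2, 4096, 14336

workspace13 = num_experts * max_num_tokens * num_dp * K   # 4,194,304 elements
workspace2 = max_num_tokens * num_dp * N                  # 1,835,008 elements
```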
@@ -627,27 +627,27 @@ def apply(
     ) -> torch.Tensor:
         assert hidden_states.dim() == 3
         assert expert_num_tokens is not None
-        hidden_dim = hidden_states.shape[-1]
+        hidden_dim = hidden_states.size(-1)

         if self.max_num_tokens is None:
-            max_num_tokens = hidden_states.shape[1]
+            max_num_tokens = hidden_states.size(1)
         else:
             max_num_tokens = self.max_num_tokens

         num_dp = self.world_size // self.dp_size
         num_experts = global_num_experts
         out = _resize_cache(workspace13,
                             (num_experts, max_num_tokens * num_dp, hidden_dim))
-        num_local_experts = w1.shape[0]
-        assert num_local_experts == w1.shape[
-            0], f"{num_local_experts} == {w1.shape[0]}"
+        num_local_experts = w1.size(0)
+        assert num_local_experts == w1.size(0), (
+            f"{num_local_experts} == {w1.size(0)}")

-        N = w1.shape[1] // 2
+        N = w1.size(1) // 2

         # Not cudagraph friendly
         assert (torch.cuda.is_current_stream_capturing()
-                or torch.all(expert_num_tokens <= max_num_tokens)), (
-                    f"{expert_num_tokens} <= {max_num_tokens}")
+                or torch.all(expert_num_tokens <= max_num_tokens * num_dp)), (
+                    f"{expert_num_tokens} <= {max_num_tokens * num_dp}")

         for expert in range(num_local_experts):
             # Indexing expert_num_tokens doesn't work w/cudagraphs
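This hunk loosens the bound to `max_num_tokens * num_dp`, since an expert's batch can collect tokens from every DP rank, and it also illustrates a CUDA graph constraint: `torch.all(...)` on a device tensor is a data-dependent read that cannot be evaluated while a graph is being captured, so the check only runs eagerly. A minimal sketch of that guard pattern, assuming a CUDA build (helper name is invented):

```python
import torch

def check_expert_counts(expert_num_tokens: torch.Tensor, bound: int) -> None:
    # Materializing torch.all(...) as a Python bool is illegal inside CUDA
    # graph capture, so the data-dependent assert is skipped during capture.
    assert (torch.cuda.is_current_stream_capturing()
            or bool(torch.all(expert_num_tokens <= bound))), (
        f"{expert_num_tokens} <= {bound}")
```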
@@ -699,8 +699,8 @@ def workspace_shapes(
     ) -> Tuple[int, int, torch.dtype]:
         assert a.dim() == 2
         num_dp = self.world_size // self.dp_size
-        max_num_tokens = a.shape[
-            0] if self.max_num_tokens is None else self.max_num_tokens
+        max_num_tokens = a.size(
+            0) if self.max_num_tokens is None else self.max_num_tokens
         workspace13 = num_experts * max_num_tokens * num_dp * max(K, N)
         workspace2 = num_experts * max_num_tokens * num_dp * (N // 2)
         return (workspace13, workspace2, a.dtype)
@@ -726,12 +726,12 @@ def apply(
     ) -> torch.Tensor:
         # Check constraints.
         if self.use_int4_w4a16:
-            assert hidden_states.shape[-1] // 2 == w1.shape[
-                2], "Hidden size mismatch"
+            assert hidden_states.size(-1) // 2 == w1.size(2), (
+                "Hidden size mismatch")
         else:
-            assert hidden_states.shape[-1] == w1.shape[2], \
-                (f"Hidden size mismatch {hidden_states.shape[-1]} "
-                 f"!= {w1.shape[2]}")
+            assert hidden_states.size(-1) == w1.size(2), (
+                f"Hidden size mismatch {hidden_states.size(-1)} "
+                f"!= {w1.size(2)}")

         assert hidden_states.is_contiguous(
         ), "Hidden_states must be contiguous"
@@ -745,17 +745,17 @@ def apply(
         E, num_tokens, N, K, top_k_num = mk._moe_problem_size(
             hidden_states, w1, w2, topk_ids)

-        assert w1.shape[0] == E
-        assert w2.shape[0] == E
+        assert w1.size(0) == E
+        assert w2.size(0) == E

         config_dtype = get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8,
                                             use_int8_w8a16=self.use_int8_w8a16,
                                             use_int4_w4a16=self.use_int4_w4a16,
                                             dtype=hidden_states.dtype)

         config = try_get_optimal_moe_config(
-            w1.shape,
-            w2.shape,
+            w1.size(),
+            w2.size(),
             top_k_num,
             config_dtype,
             num_tokens,
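Passing `w1.size()` where `w1.shape` used to go is safe because `Tensor.size()` returns a `torch.Size`, which is a `tuple` subclass and works anywhere a plain shape tuple is accepted. Quick demonstration:

```python
import torch

w1 = torch.empty(8, 256, 128)
shape = w1.size()

assert isinstance(shape, tuple)     # torch.Size subclasses tuple
assert shape == (8, 256, 128)
assert shape[1:] == (256, 128)      # ordinary tuple operations still work
```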
@@ -797,13 +797,13 @@ def apply(
             config=config,
             block_shape=self.block_shape)

-        # Fix activations
-        if True:
-            assert activation == "silu"
+        if activation == "silu":
             invoke_batched_silu_and_mul(output=intermediate_cache2,
                                         input=intermediate_cache1,
                                         expert_num_tokens=expert_num_tokens)
         else:
+            # TODO: would be nice to use expert_num_tokens here to reduce
+            # garbage compute
             self.activation(activation, intermediate_cache2.view(-1, N // 2),
                             intermediate_cache1.view(-1, N))

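The rewritten branch dispatches on the activation name instead of asserting it: the batched SiLU-and-mul kernel consumes the doubled intermediate of width `N` and writes width `N // 2`, computing `silu(gate) * up`. A reference (non-Triton) version of that contract for comparison, assuming the usual gate-first/up-second layout:

```python
import torch
import torch.nn.functional as F

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    """Gate activation in the first half, up-projection in the second."""
    d = x.size(-1) // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(4, 16, 256)                         # (..., N)
assert silu_and_mul_ref(x).size() == (4, 16, 128)   # (..., N // 2)
```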