
Commit 26b9aaa

Author: Varun Sundar Rabindranath (committed)

cleanup

Signed-off-by: Varun Sundar Rabindranath <[email protected]>

1 parent 4566231, commit 26b9aaa

File tree: 6 files changed (+33, -78 lines)


tests/distributed/eplb_utils.py
Lines changed: 9 additions & 7 deletions

@@ -1,17 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import torch
 import multiprocessing
 import os
-from vllm.utils.system_utils import update_environment_variables
 import random

+import torch
+
 from vllm.distributed.parallel_state import (
     init_distributed_environment,
 )
+from vllm.utils.system_utils import update_environment_variables
+

-def distributed_run(fn, world_size, fn_kwargs = None):
+def distributed_run(fn, world_size):
     number_of_processes = world_size
     processes: list[multiprocessing.Process] = []
     for i in range(number_of_processes):
@@ -22,7 +24,7 @@ def distributed_run(fn, world_size, fn_kwargs = None):
         env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
         env["MASTER_ADDR"] = "localhost"
         env["MASTER_PORT"] = "12345"
-        p = multiprocessing.Process(target=fn, args=(env, fn_kwargs if fn_kwargs is not None else {}))
+        p = multiprocessing.Process(target=fn, args=(env,))
         processes.append(p)
         p.start()

@@ -37,7 +39,7 @@ def worker_fn_wrapper(fn):
     # `multiprocessing.Process` cannot accept environment variables directly
     # so we need to pass the environment variables as arguments
     # and update the environment variables in the function
-    def wrapped_fn(env, fn_kwargs):
+    def wrapped_fn(env):
         update_environment_variables(env)
         local_rank = os.environ["LOCAL_RANK"]
         device = torch.device(f"cuda:{local_rank}")
@@ -48,6 +50,6 @@ def wrapped_fn(env, fn_kwargs):
         random.seed(42)
         torch.manual_seed(42)

-        fn(**fn_kwargs)
+        fn()

-    return wrapped_fn
+    return wrapped_fn
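
For context, a minimal usage sketch of the simplified helpers (the worker body and world size below are placeholders, not part of this commit): tests decorate a zero-argument worker with worker_fn_wrapper and hand it to distributed_run; with fn_kwargs gone, per-test parameters have to be captured in the worker's closure instead.

    from .eplb_utils import distributed_run, worker_fn_wrapper  # same relative import the tests use

    WORLD_SIZE = 2  # placeholder: number of GPUs the test expects

    @worker_fn_wrapper
    def worker_fn():
        # worker_fn_wrapper has already applied the per-rank env vars,
        # picked cuda:{LOCAL_RANK} and seeded the RNGs; the test body goes
        # here, with any parameters captured via the enclosing scope.
        ...

    distributed_run(worker_fn, WORLD_SIZE)  # spawns WORLD_SIZE processes, one per rank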

tests/distributed/test_eplb_execute.py
Lines changed: 19 additions & 20 deletions

@@ -12,8 +12,10 @@
     ensure_model_parallel_initialized,
     get_tp_group,
 )
+
 from .eplb_utils import distributed_run, worker_fn_wrapper

+
 def create_expert_indices_with_redundancy(
     num_layers: int,
     num_logical_experts: int,
@@ -107,11 +109,9 @@ def create_redundancy_config(
     num_logical_experts: int,
     num_physical_experts: int,
 ) -> list[int]:
-    # num logical 3
-    # num physical 4
     """Create a redundancy configuration."""
-    redundancy_config = [1] * num_logical_experts # [1, 1, 1]
-    remaining = num_physical_experts - num_logical_experts # remaining 1
+    redundancy_config = [1] * num_logical_experts
+    remaining = num_physical_experts - num_logical_experts
     # Randomly assign the remaining physical experts to the logical experts
     for _ in range(remaining):
         redundancy_config[random.choice(range(num_logical_experts))] += 1
@@ -237,21 +237,21 @@ def verify_redundant_experts_have_same_weights(
         # 2 GPU, 2 experts per GPU
         # 3 logical experts, 4 physical experts, 1 redundant experts
         (2, 1, 2, 3),
-        ## 2 GPU, 3 experts per GPU
-        ## 4 logical experts, 6 physical experts, 2 redundant experts
-        #(2, 2, 3, 4),
-        ## 2 GPU, 8 experts per GPU
-        ## 16 logical experts, 16 physical experts, 0 redundant experts
-        #(2, 4, 8, 16),
-        ## 4 GPU, 2 experts per GPU
-        ## 6 logical experts, 8 physical experts, 2 redundant experts
-        #(4, 1, 2, 6),
-        ## 4 GPU, 2 experts per GPU
-        ## 5 logical experts, 8 physical experts, 3 redundant experts
-        #(4, 2, 2, 5),
-        ## 4 GPU, 8 experts per GPU
-        ## 16 logical experts, 32 physical experts, 16 redundant experts
-        #(4, 8, 8, 16),
+        # 2 GPU, 3 experts per GPU
+        # 4 logical experts, 6 physical experts, 2 redundant experts
+        (2, 2, 3, 4),
+        # 2 GPU, 8 experts per GPU
+        # 16 logical experts, 16 physical experts, 0 redundant experts
+        (2, 4, 8, 16),
+        # 4 GPU, 2 experts per GPU
+        # 6 logical experts, 8 physical experts, 2 redundant experts
+        (4, 1, 2, 6),
+        # 4 GPU, 2 experts per GPU
+        # 5 logical experts, 8 physical experts, 3 redundant experts
+        (4, 2, 2, 5),
+        # 4 GPU, 8 experts per GPU
+        # 16 logical experts, 32 physical experts, 16 redundant experts
+        (4, 8, 8, 16),
     ],
 )
 def test_rearrange_expert_weights_with_redundancy(
@@ -282,7 +282,6 @@ def worker_fn():
         redundancy_config = create_redundancy_config(
             num_logical_experts, total_physical_experts
         )
-        print (f"redundancy config : {redundancy_config}")

         old_indices = create_expert_indices_with_redundancy(
             num_layers,
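
As an aside (not part of the diff), create_redundancy_config gives each logical expert one replica and then scatters the leftover physical slots at random, so the replica counts always sum to the number of physical experts; a standalone sketch mirroring the helper above:

    import random

    def create_redundancy_config(num_logical_experts: int,
                                 num_physical_experts: int) -> list[int]:
        # mirrors the helper shown in the diff above
        redundancy_config = [1] * num_logical_experts
        remaining = num_physical_experts - num_logical_experts
        for _ in range(remaining):
            redundancy_config[random.choice(range(num_logical_experts))] += 1
        return redundancy_config

    # e.g. the (2, 1, 2, 3) case above: 3 logical experts over 4 physical slots
    cfg = create_redundancy_config(3, 4)
    assert sum(cfg) == 4 and sorted(cfg) == [1, 1, 2]  # exactly one expert is duplicated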

tests/distributed/test_eplb_fused_moe_layer.py
Lines changed: 0 additions & 1 deletion

@@ -247,7 +247,6 @@ def worker_fn():

         rearrange_expert_weights_inplace(
             indices,
-            # indices,
             shuffled_indices,
             rank_expert_weights,
             ep_group,

tools/ep_kernels/install_python_libraries.sh
Lines changed: 5 additions & 10 deletions

@@ -12,23 +12,19 @@ if [ ! -d "$WORKSPACE" ]; then
 fi

 # configurable pip command (default: pip3)
-#PIP_CMD=${PIP_CMD:-pip3}
-PIP_CMD="uv pip"
+PIP_CMD=${PIP_CMD:-pip3}
 CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
-CUDACXX=${CUDA_HOME}/bin/nvcc
-export CUDACXX=${CUDACXX}
-export CUDA_PATH=${CUDA_HOME}

 # install dependencies if not installed
-$PIP_CMD install cmake torch ninja --torch-backend=cu129
+$PIP_CMD install cmake torch ninja

 # build nvshmem
 pushd $WORKSPACE
 mkdir -p nvshmem_src
 wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
 tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
 pushd nvshmem_src
-#wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
+wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
 git init
 git apply -vvv nvshmem.patch

@@ -67,7 +63,6 @@ cmake --build $WORKSPACE/nvshmem_build/ --target install
 popd

 export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH
-export NVSHMEM_DIR=$WORKSPACE/nvshmem_install

 is_git_dirty() {
     local dir=$1
@@ -120,13 +115,13 @@ clone_repo() {
 pushd $WORKSPACE
 clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf"
 cd pplx-kernels
-$PIP_CMD install --no-build-isolation -vvv -e . --torch-backend=cu129
+$PIP_CMD install --no-build-isolation -vvv -e .
 popd

 # build and install deepep, require pytorch installed
 pushd $WORKSPACE
 clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "73b6ea4"
 cd DeepEP
 export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
-$PIP_CMD install --no-build-isolation -vvv -e . --torch-backend=cu129
+$PIP_CMD install --no-build-isolation -vvv -e .
 popd

vllm/distributed/eplb/rebalance_execute.py
Lines changed: 0 additions & 37 deletions

@@ -112,8 +112,6 @@ def shuffle_layer(
     """
     Perform expert weights rearrangement of one layer.
     """
-    is_debug = ep_rank == 1
-
     local2global = partial(
         idx_local_to_global,
         local_cnt=num_local_experts,
@@ -126,15 +124,6 @@
         for i in range(num_local_experts)
     ]

-    def describe(t):
-        return f"{t.size()} {t.stride()}"
-
-    if is_debug:
-        print (f"old experts : {old_indices}")
-        print (f"new experts : {new_indices}")
-        print (f"unchanged : {is_unchanged}")
-
-
     # 1. Perform weight copy inside the local rank.
     is_received_locally = is_unchanged[:]
     for src in range(num_local_experts):
@@ -148,8 +137,6 @@ def describe(t):
             if old_indices[src_global] == new_indices[dst_global]:
                 is_received_locally[dst] = True
                 for weight, buffer in zip(expert_weights, expert_weights_buffer):
-                    if is_debug:
-                        print (f" - receive locally : buffer[{dst}] {describe(buffer[dst])} <- weight[{src}] {describe(weight[src])}")
                     buffer[dst].copy_(weight[src])

     p2p_ops: list[P2POp] = []
@@ -286,25 +273,6 @@ def rearrange_expert_weights_inplace(
             communications to reserve enough memory for the buffers.
         rank_mapping: A dictionary mapping old rank to new rank.
     """
-
-    #old_global_expert_indices: torch.Tensor,
-    #new_global_expert_indices: torch.Tensor,
-    #expert_weights: Sequence[Iterable[torch.Tensor]],
-    #ep_group: ProcessGroup,
-    #is_profile: bool = False,
-    #rank_mapping: dict[int, int] | None = None,
-
-    is_debug = ep_group.rank() == 1
-
-    if is_debug and False:
-        s = "Rearrange_expert_weights_in_place: \n"
-        s += f" - old_global_expert_indices : {old_global_expert_indices} \n"
-        s += f" - new_global_expert_indices : {new_global_expert_indices} \n"
-        s += f" - expert_weights : #{len(expert_weights)} tensors \n"
-        s += f" - is_profile : {is_profile} \n"
-        s += f" - rank_mapping : {rank_mapping} \n"
-        print (s)
-
     if rank_mapping is not None:
         if len(rank_mapping) == ep_group.size():
             # scale down
@@ -325,9 +293,6 @@ def rearrange_expert_weights_inplace(
     num_moe_layers, num_physical_experts = old_global_expert_indices.shape
     assert len(expert_weights) == num_moe_layers

-    if is_debug:
-        print (f"num_moe_layers : {num_moe_layers} / num_physical_experts: {num_physical_experts} ")
-
     num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
     assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)

@@ -364,8 +329,6 @@ def rearrange_expert_weights_inplace(
     torch.cuda.synchronize()

     for layer in range(num_moe_layers):
-        if is_debug:
-            print (f"shuffling layer : {layer} ...")
         shuffle_layer(
             num_local_physical_experts,
             ep_rank,

vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
Lines changed: 0 additions & 3 deletions

@@ -382,9 +382,6 @@ def apply(
             topk=topk_ids.size(-1),
         )

-        assert self.w1_scale is not None
-        assert not self.w1_scale.is_contiguous()
-
         fp8_m_grouped_gemm_nt_masked(
             (a1q, a1q_scale),
             (w1, self.w1_scale),
