
Commit 26b9aaa

Author: Varun Sundar Rabindranath (committed)

cleanup

Signed-off-by: Varun Sundar Rabindranath <[email protected]>

1 parent 4566231, commit 26b9aaa

File tree: 6 files changed (+33, -78 lines)


tests/distributed/eplb_utils.py
Lines changed: 9 additions & 7 deletions

@@ -1,17 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import torch
 import multiprocessing
 import os
-from vllm.utils.system_utils import update_environment_variables
 import random

+import torch
+
 from vllm.distributed.parallel_state import (
     init_distributed_environment,
 )
+from vllm.utils.system_utils import update_environment_variables
+

-def distributed_run(fn, world_size, fn_kwargs = None):
+def distributed_run(fn, world_size):
     number_of_processes = world_size
     processes: list[multiprocessing.Process] = []
     for i in range(number_of_processes):
@@ -22,7 +24,7 @@ def distributed_run(fn, world_size, fn_kwargs = None):
         env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
         env["MASTER_ADDR"] = "localhost"
         env["MASTER_PORT"] = "12345"
-        p = multiprocessing.Process(target=fn, args=(env, fn_kwargs if fn_kwargs is not None else {}))
+        p = multiprocessing.Process(target=fn, args=(env,))
         processes.append(p)
         p.start()

@@ -37,7 +39,7 @@ def worker_fn_wrapper(fn):
     # `multiprocessing.Process` cannot accept environment variables directly
     # so we need to pass the environment variables as arguments
     # and update the environment variables in the function
-    def wrapped_fn(env, fn_kwargs):
+    def wrapped_fn(env):
         update_environment_variables(env)
         local_rank = os.environ["LOCAL_RANK"]
         device = torch.device(f"cuda:{local_rank}")
@@ -48,6 +50,6 @@ def wrapped_fn(env, fn_kwargs):
         random.seed(42)
         torch.manual_seed(42)

-        fn(**fn_kwargs)
+        fn()

-    return wrapped_fn
+    return wrapped_fn
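
For context, a minimal usage sketch of the simplified helpers (the worker body and world size below are placeholders, not part of this commit): tests decorate a zero-argument worker with worker_fn_wrapper and hand it to distributed_run; with fn_kwargs gone, per-test parameters have to be captured in the worker's closure instead.

    from .eplb_utils import distributed_run, worker_fn_wrapper  # same relative import the tests use

    WORLD_SIZE = 2  # placeholder: number of GPUs the test expects

    @worker_fn_wrapper
    def worker_fn():
        # worker_fn_wrapper has already applied the per-rank env vars,
        # picked cuda:{LOCAL_RANK} and seeded the RNGs; the test body goes
        # here, with any parameters captured via the enclosing scope.
        ...

    distributed_run(worker_fn, WORLD_SIZE)  # spawns WORLD_SIZE processes, one per rank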

tests/distributed/test_eplb_execute.py
Lines changed: 19 additions & 20 deletions

@@ -12,8 +12,10 @@
     ensure_model_parallel_initialized,
     get_tp_group,
 )
+
 from .eplb_utils import distributed_run, worker_fn_wrapper

+
 def create_expert_indices_with_redundancy(
     num_layers: int,
     num_logical_experts: int,
@@ -107,11 +109,9 @@ def create_redundancy_config(
     num_logical_experts: int,
     num_physical_experts: int,
 ) -> list[int]:
-    # num logical 3
-    # num physical 4
     """Create a redundancy configuration."""
-    redundancy_config = [1] * num_logical_experts # [1, 1, 1]
-    remaining = num_physical_experts - num_logical_experts # remaining 1
+    redundancy_config = [1] * num_logical_experts
+    remaining = num_physical_experts - num_logical_experts
     # Randomly assign the remaining physical experts to the logical experts
     for _ in range(remaining):
         redundancy_config[random.choice(range(num_logical_experts))] += 1
@@ -237,21 +237,21 @@ def verify_redundant_experts_have_same_weights(
         # 2 GPU, 2 experts per GPU
         # 3 logical experts, 4 physical experts, 1 redundant experts
         (2, 1, 2, 3),
-        ## 2 GPU, 3 experts per GPU
-        ## 4 logical experts, 6 physical experts, 2 redundant experts
-        #(2, 2, 3, 4),
-        ## 2 GPU, 8 experts per GPU
-        ## 16 logical experts, 16 physical experts, 0 redundant experts
-        #(2, 4, 8, 16),
-        ## 4 GPU, 2 experts per GPU
-        ## 6 logical experts, 8 physical experts, 2 redundant experts
-        #(4, 1, 2, 6),
-        ## 4 GPU, 2 experts per GPU
-        ## 5 logical experts, 8 physical experts, 3 redundant experts
-        #(4, 2, 2, 5),
-        ## 4 GPU, 8 experts per GPU
-        ## 16 logical experts, 32 physical experts, 16 redundant experts
-        #(4, 8, 8, 16),
+        # 2 GPU, 3 experts per GPU
+        # 4 logical experts, 6 physical experts, 2 redundant experts
+        (2, 2, 3, 4),
+        # 2 GPU, 8 experts per GPU
+        # 16 logical experts, 16 physical experts, 0 redundant experts
+        (2, 4, 8, 16),
+        # 4 GPU, 2 experts per GPU
+        # 6 logical experts, 8 physical experts, 2 redundant experts
+        (4, 1, 2, 6),
+        # 4 GPU, 2 experts per GPU
+        # 5 logical experts, 8 physical experts, 3 redundant experts
+        (4, 2, 2, 5),
+        # 4 GPU, 8 experts per GPU
+        # 16 logical experts, 32 physical experts, 16 redundant experts
+        (4, 8, 8, 16),
     ],
 )
 def test_rearrange_expert_weights_with_redundancy(
@@ -282,7 +282,6 @@ def worker_fn():
         redundancy_config = create_redundancy_config(
             num_logical_experts, total_physical_experts
         )
-        print (f"redundancy config : {redundancy_config}")

         old_indices = create_expert_indices_with_redundancy(
             num_layers,
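
As an aside (not part of the diff), create_redundancy_config gives each logical expert one replica and then scatters the leftover physical slots at random, so the replica counts always sum to the number of physical experts; a standalone sketch mirroring the helper above:

    import random

    def create_redundancy_config(num_logical_experts: int,
                                 num_physical_experts: int) -> list[int]:
        # mirrors the helper shown in the diff above
        redundancy_config = [1] * num_logical_experts
        remaining = num_physical_experts - num_logical_experts
        for _ in range(remaining):
            redundancy_config[random.choice(range(num_logical_experts))] += 1
        return redundancy_config

    # e.g. the (2, 1, 2, 3) case above: 3 logical experts over 4 physical slots
    cfg = create_redundancy_config(3, 4)
    assert sum(cfg) == 4 and sorted(cfg) == [1, 1, 2]  # exactly one expert is duplicated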

tests/distributed/test_eplb_fused_moe_layer.py
Lines changed: 0 additions & 1 deletion

@@ -247,7 +247,6 @@ def worker_fn():

         rearrange_expert_weights_inplace(
             indices,
-            # indices,
             shuffled_indices,
             rank_expert_weights,
             ep_group,

tools/ep_kernels/install_python_libraries.sh
Lines changed: 5 additions & 10 deletions

@@ -12,23 +12,19 @@ if [ ! -d "$WORKSPACE" ]; then
 fi

 # configurable pip command (default: pip3)
-#PIP_CMD=${PIP_CMD:-pip3}
-PIP_CMD="uv pip"
+PIP_CMD=${PIP_CMD:-pip3}
 CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
-CUDACXX=${CUDA_HOME}/bin/nvcc
-export CUDACXX=${CUDACXX}
-export CUDA_PATH=${CUDA_HOME}

 # install dependencies if not installed
-$PIP_CMD install cmake torch ninja --torch-backend=cu129
+$PIP_CMD install cmake torch ninja

 # build nvshmem
 pushd $WORKSPACE
 mkdir -p nvshmem_src
 wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
 tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1
 pushd nvshmem_src
-#wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
+wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch
 git init
 git apply -vvv nvshmem.patch

@@ -67,7 +63,6 @@ cmake --build $WORKSPACE/nvshmem_build/ --target install
 popd

 export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH
-export NVSHMEM_DIR=$WORKSPACE/nvshmem_install

 is_git_dirty() {
     local dir=$1
@@ -120,13 +115,13 @@ clone_repo() {
 pushd $WORKSPACE
 clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py" "c336faf"
 cd pplx-kernels
-$PIP_CMD install --no-build-isolation -vvv -e . --torch-backend=cu129
+$PIP_CMD install --no-build-isolation -vvv -e .
 popd

 # build and install deepep, require pytorch installed
 pushd $WORKSPACE
 clone_repo "https://github.com/deepseek-ai/DeepEP" "DeepEP" "setup.py" "73b6ea4"
 cd DeepEP
 export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
-$PIP_CMD install --no-build-isolation -vvv -e . --torch-backend=cu129
+$PIP_CMD install --no-build-isolation -vvv -e .
 popd

vllm/distributed/eplb/rebalance_execute.py
Lines changed: 0 additions & 37 deletions

@@ -112,8 +112,6 @@ def shuffle_layer(
     """
     Perform expert weights rearrangement of one layer.
     """
-    is_debug = ep_rank == 1
-
     local2global = partial(
         idx_local_to_global,
         local_cnt=num_local_experts,
@@ -126,15 +124,6 @@
         for i in range(num_local_experts)
     ]

-    def describe(t):
-        return f"{t.size()} {t.stride()}"
-
-    if is_debug:
-        print (f"old experts : {old_indices}")
-        print (f"new experts : {new_indices}")
-        print (f"unchanged : {is_unchanged}")
-
-
     # 1. Perform weight copy inside the local rank.
     is_received_locally = is_unchanged[:]
     for src in range(num_local_experts):
@@ -148,8 +137,6 @@ def describe(t):
             if old_indices[src_global] == new_indices[dst_global]:
                 is_received_locally[dst] = True
                 for weight, buffer in zip(expert_weights, expert_weights_buffer):
-                    if is_debug:
-                        print (f" - receive locally : buffer[{dst}] {describe(buffer[dst])} <- weight[{src}] {describe(weight[src])}")
                     buffer[dst].copy_(weight[src])

     p2p_ops: list[P2POp] = []
@@ -286,25 +273,6 @@ def rearrange_expert_weights_inplace(
             communications to reserve enough memory for the buffers.
         rank_mapping: A dictionary mapping old rank to new rank.
     """
-
-    #old_global_expert_indices: torch.Tensor,
-    #new_global_expert_indices: torch.Tensor,
-    #expert_weights: Sequence[Iterable[torch.Tensor]],
-    #ep_group: ProcessGroup,
-    #is_profile: bool = False,
-    #rank_mapping: dict[int, int] | None = None,
-
-    is_debug = ep_group.rank() == 1
-
-    if is_debug and False:
-        s = "Rearrange_expert_weights_in_place: \n"
-        s += f" - old_global_expert_indices : {old_global_expert_indices} \n"
-        s += f" - new_global_expert_indices : {new_global_expert_indices} \n"
-        s += f" - expert_weights : #{len(expert_weights)} tensors \n"
-        s += f" - is_profile : {is_profile} \n"
-        s += f" - rank_mapping : {rank_mapping} \n"
-        print (s)
-
     if rank_mapping is not None:
         if len(rank_mapping) == ep_group.size():
             # scale down
@@ -325,9 +293,6 @@ def rearrange_expert_weights_inplace(
     num_moe_layers, num_physical_experts = old_global_expert_indices.shape
     assert len(expert_weights) == num_moe_layers

-    if is_debug:
-        print (f"num_moe_layers : {num_moe_layers} / num_physical_experts: {num_physical_experts} ")
-
     num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
     assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)

@@ -364,8 +329,6 @@ def rearrange_expert_weights_inplace(
     torch.cuda.synchronize()

     for layer in range(num_moe_layers):
-        if is_debug:
-            print (f"shuffling layer : {layer} ...")
         shuffle_layer(
             num_local_physical_experts,
             ep_rank,

vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
Lines changed: 0 additions & 3 deletions

@@ -382,9 +382,6 @@ def apply(
             topk=topk_ids.size(-1),
         )

-        assert self.w1_scale is not None
-        assert not self.w1_scale.is_contiguous()
-
         fp8_m_grouped_gemm_nt_masked(
             (a1q, a1q_scale),
             (w1, self.w1_scale),
