
Commit c0b2ba1 — "merge main 09243"
2 parents: 6997a18 + f0a0a27
File tree: 67 files changed, +4062 −329 lines


docker/Dockerfile.nixl

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG TARGETPLATFORM
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    libssl-dev \
    curl \
    g++ \
    make \
    git && \
    rm -rf /var/lib/apt/lists/*

RUN case ${TARGETPLATFORM} in \
    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
    *) MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

RUN case ${TARGETPLATFORM} in \
    "linux/arm64") exit 1 ;; \
    *) /opt/conda/bin/conda update -y conda && \
    /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
    esac && \
    /opt/conda/bin/conda clean -ya


WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124

RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .

RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms
RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev

ENV CUDA_HOME=/usr/local/cuda \
    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

RUN mkdir -p /tmp/gdrcopy && cd /tmp \
    && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
    && cd gdrcopy/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
    && cd / && rm -rf /tmp/gdrcopy

RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
    DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
    rm -rf /usr/lib/ucx && \
    rm -rf /opt/hpcx/ucx && \
    cd /usr/local/src && \
    git clone https://github.com/openucx/ucx.git && \
    cd ucx && \
    git checkout v1.19.x && \
    ./autogen.sh && ./configure \
    --enable-shared \
    --disable-static \
    --disable-doxygen-doc \
    --enable-optimizations \
    --enable-cma \
    --enable-devel-headers \
    --with-cuda=/usr/local/cuda \
    --with-verbs=yes \
    --with-dm \
    --with-gdrcopy=/usr/local \
    --with-efa \
    --enable-mt && \
    make -j && \
    make -j install-strip && \
    ldconfig;

RUN apt-get update && apt-get install -y pkg-config tmux net-tools ; \
    cd /usr/local/src; \
    pip install --upgrade meson pybind11 patchelf; \
    git clone https://github.com/ai-dynamo/nixl.git -b main && \
    cd nixl && \
    rm -rf build && \
    mkdir build && \
    meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \
    cd build && \
    ninja && \
    ninja install && \
    cd .. && pip install . --no-deps;

COPY . /lightllm
RUN pip install -e /lightllm --no-cache-dir

docker/Dockerfile.nixl.deepep

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04

ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG TARGETPLATFORM

ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    libssl-dev \
    curl \
    g++ \
    make \
    git && \
    rm -rf /var/lib/apt/lists/*

RUN case ${TARGETPLATFORM} in \
    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
    *) MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

RUN case ${TARGETPLATFORM} in \
    "linux/arm64") exit 1 ;; \
    *) /opt/conda/bin/conda update -y conda && \
    /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
    esac && \
    /opt/conda/bin/conda clean -ya


WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124

RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .

RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms
RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev

ENV CUDA_HOME=/usr/local/cuda \
    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

RUN mkdir -p /tmp/gdrcopy && cd /tmp \
    && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
    && cd gdrcopy/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
    && cd / && rm -rf /tmp/gdrcopy

# Fix DeepEP IBGDA symlink
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so

RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
    && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
    && cd nvshmem \
    && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
    && NVSHMEM_SHMEM_SUPPORT=0 \
    NVSHMEM_UCX_SUPPORT=0 \
    NVSHMEM_USE_NCCL=0 \
    NVSHMEM_MPI_SUPPORT=0 \
    NVSHMEM_IBGDA_SUPPORT=1 \
    NVSHMEM_PMIX_SUPPORT=0 \
    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
    NVSHMEM_USE_GDRCOPY=1 \
    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \
    && cmake --build build --target install -j64

ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58
RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..

WORKDIR /root/DeepEP
ENV NVSHMEM_DIR=/root/nvshmem/install
RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install

RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
    DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
    rm -rf /usr/lib/ucx && \
    rm -rf /opt/hpcx/ucx && \
    cd /usr/local/src && \
    git clone https://github.com/openucx/ucx.git && \
    cd ucx && \
    git checkout v1.19.x && \
    ./autogen.sh && ./configure \
    --enable-shared \
    --disable-static \
    --disable-doxygen-doc \
    --enable-optimizations \
    --enable-cma \
    --enable-devel-headers \
    --with-cuda=/usr/local/cuda \
    --with-verbs=yes \
    --with-dm \
    --with-gdrcopy=/usr/local \
    --with-efa \
    --enable-mt && \
    make -j && \
    make -j install-strip && \
    ldconfig;

RUN apt-get update && apt-get install -y pkg-config tmux net-tools ; \
    cd /usr/local/src; \
    pip install --upgrade meson pybind11 patchelf; \
    git clone https://github.com/ai-dynamo/nixl.git -b main && \
    cd nixl && \
    rm -rf build && \
    mkdir build && \
    meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \
    cd build && \
    ninja && \
    ninja install && \
    cd .. && pip install . --no-deps;

COPY . /lightllm
RUN pip install -e /lightllm --no-cache-dir

lightllm/common/deepseek2_mem_manager.py

Lines changed: 52 additions & 0 deletions
@@ -8,6 +8,7 @@
 from lightllm.common.kv_trans_kernel.kv_trans import kv_trans
 from lightllm.common.kv_trans_kernel.kv_trans_v2 import kv_trans_v2_for_d_node, kv_trans_v2_for_p_node
 from lightllm.distributed.pynccl import PyNcclCommunicator
+from lightllm.common.kv_trans_kernel.nixl_kv_trans import mla_page_io

 logger = init_logger(__name__)

@@ -36,6 +37,57 @@ def alloc_kv_move_buffer(self, max_req_total_len):
         self.token_dim_size = self.kv_move_buffer.shape[-1] * self.kv_move_buffer.shape[-2]
         return

+    def alloc_paged_kv_move_buffer(self, page_num, page_size) -> torch.Tensor:
+        self.kv_move_buffer = torch.empty(
+            (page_num, page_size, self.layer_num, self.head_num, self.head_dim), dtype=self.dtype, device="cuda"
+        )
+        self._buffer_mem_indexes_tensors = [
+            torch.empty((page_size,), dtype=torch.int64, device="cpu", pin_memory=True) for _ in range(page_num)
+        ]
+        return self.kv_move_buffer
+
+    def write_mem_to_page_kv_move_buffer(
+        self,
+        mem_indexes: List[int],
+        page_index: int,
+        dp_index: int,
+        mem_managers: List["MemoryManager"],
+        dp_world_size: int,
+    ):
+        cur_page = self.kv_move_buffer[page_index]
+        pin_mem_indexes = self._buffer_mem_indexes_tensors[page_index][0 : len(mem_indexes)]
+        pin_mem_indexes.numpy()[:] = mem_indexes
+        mem_indexes_gpu = pin_mem_indexes.cuda(non_blocking=True)
+        dp_mems = mem_managers[(dp_index * dp_world_size) : ((dp_index + 1) * dp_world_size)]
+        mla_page_io(
+            mem_indexes=mem_indexes_gpu,
+            page_tensor=cur_page,
+            kv_buffer=dp_mems[0].kv_buffer,
+            mode="write",
+        )
+        return
+
+    def read_page_kv_move_buffer_to_mem(
+        self,
+        mem_indexes: List[int],
+        page_index: int,
+        dp_index: int,
+        mem_managers: List["MemoryManager"],
+        dp_world_size: int,
+    ):
+        cur_page = self.kv_move_buffer[page_index]
+        pin_mem_indexes = self._buffer_mem_indexes_tensors[page_index][0 : len(mem_indexes)]
+        pin_mem_indexes.numpy()[:] = mem_indexes
+        mem_indexes_gpu = pin_mem_indexes.cuda(non_blocking=True)
+        dp_mems = mem_managers[(dp_index * dp_world_size) : ((dp_index + 1) * dp_world_size)]
+        for mem in dp_mems:
+            mla_page_io(
+                mem_indexes=mem_indexes_gpu,
+                page_tensor=cur_page,
+                kv_buffer=mem.kv_buffer,
+                mode="read",
+            )
+
     def send_to_decode_node(
         self,
         move_tasks: List[KVMoveTask],
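
For orientation, the staged copy that these two methods delegate to mla_page_io can be pictured as a gather on "write" and a scatter on "read" between the paged move buffer and a KV buffer. The sketch below is a plain-PyTorch reference of that assumed behavior only; the function name page_io_reference, the token-major kv_buffer layout, and the shapes are illustrative assumptions, not the actual Triton kernel in nixl_kv_trans.

import torch

def page_io_reference(mem_indexes: torch.Tensor, page_tensor: torch.Tensor,
                      kv_buffer: torch.Tensor, mode: str) -> None:
    # Assumed layouts, for illustration only:
    #   kv_buffer:   [total_tokens, layer_num, head_num, head_dim]
    #   page_tensor: [page_size,    layer_num, head_num, head_dim]
    n = mem_indexes.numel()
    if mode == "write":
        # gather the selected KV rows into the first n slots of the page
        page_tensor[:n] = kv_buffer[mem_indexes]
    elif mode == "read":
        # scatter the staged rows back into the destination KV buffer
        kv_buffer[mem_indexes] = page_tensor[:n]

# tiny illustrative shapes
kv = torch.randn(32, 2, 1, 8)
page = torch.zeros(4, 2, 1, 8)
idx = torch.tensor([3, 7, 21])
page_io_reference(idx, page, kv, mode="write")
dst = torch.zeros_like(kv)
page_io_reference(idx, page, dst, mode="read")
assert torch.equal(dst[idx], kv[idx])

In the diff itself, the write path gathers from a single manager of the DP group (dp_mems[0].kv_buffer) while the read path scatters into every manager of the group, and mem_indexes is staged through a per-page pinned CPU tensor so the .cuda(non_blocking=True) host-to-device copy does not block the host.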

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 93 additions & 3 deletions
@@ -219,6 +219,91 @@ def moe_align1(
     )


+@triton.jit
+def moe_align_fused_kernel(
+    topk_ids_ptr,  # [token_num, topk]
+    topk_weights_ptr,  # [token_num, topk]
+    expert_to_token_index_ptr,  # [expert_num, token_num * topk]
+    expert_to_weight_ptr,  # [expert_num, token_num * topk]
+    expert_token_num_ptr,  # [expert_num]
+    token_num,
+    topk_num: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    token_block = tl.program_id(0)
+    offs = token_block * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offs < token_num * topk_num
+
+    expert_ids = tl.load(topk_ids_ptr + offs, mask=mask, other=0)
+    weights = tl.load(topk_weights_ptr + offs, mask=mask, other=0.0)
+
+    # Use atomic_add to reserve each expert's next write position
+    write_pos = tl.atomic_add(expert_token_num_ptr + expert_ids, 1, mask=mask)
+
+    # Write the flat token index and its weight into the reserved slots
+    tl.store(
+        expert_to_token_index_ptr + expert_ids * (token_num * topk_num) + write_pos,
+        offs,
+        mask=mask,
+    )
+    tl.store(
+        expert_to_weight_ptr + expert_ids * (token_num * topk_num) + write_pos,
+        weights,
+        mask=mask,
+    )
+
+
+def _get_moe_align_fused_static_key(
+    topk_weights: torch.Tensor,
+) -> dict:
+    topk_num = topk_weights.shape[1]
+    return {
+        "topk_num": topk_num,
+    }
+
+
+def _get_moe_align_fused_configs():
+    return [
+        {
+            "BLOCK_SIZE": bt,
+            "num_warps": nw,
+        }
+        for nw in [1, 2, 4, 8]
+        for bt in [128, 256, 512, 1024, 2048]
+    ]
+
+
+@autotune(
+    kernel_name="moe_align_fused:v1",
+    configs_gen_func=_get_moe_align_fused_configs,
+    static_key_func=_get_moe_align_fused_static_key,
+    run_key_func=lambda topk_ids: topk_ids.shape[0],
+    mutates_args=["expert_to_token_index", "expert_to_weight", "expert_token_num"],
+)
+def moe_align_fused(
+    expert_to_token_index, expert_to_weight, expert_token_num, topk_ids, topk_weights, run_config: Optional[dict] = None
+):
+    token_num, topk_num = topk_ids.shape
+    if run_config is None:
+        run_config = {}
+    BLOCK_SIZE = run_config.get("BLOCK_SIZE", 256)
+    num_warps = run_config.get("num_warps", 4)
+
+    grid = (triton.cdiv(token_num * topk_num, BLOCK_SIZE),)
+    moe_align_fused_kernel[grid](
+        topk_ids,
+        topk_weights,
+        expert_to_token_index,
+        expert_to_weight,
+        expert_token_num,
+        token_num,
+        topk_num,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
+    )
+    return expert_to_token_index, expert_to_weight, expert_token_num
+
+
 @triton.jit
 def moe_align2_kernel(
     experts_token_num_ptr,  # [expert_num,]
@@ -719,9 +804,14 @@ def fused_experts_impl(

         expert_to_tokens = torch.empty((E, topk_num * tokens_in_chunk), dtype=torch.int32, device="cuda")
         expert_to_weights = torch.empty((E, topk_num * tokens_in_chunk), dtype=torch.float32, device="cuda")
-        moe_align(topk_ids=curr_topk_ids, out=expert_to_tokens)
-        expert_to_token_num = torch.empty((E,), dtype=torch.int32, device="cuda")
-        moe_align1(expert_to_tokens, curr_topk_weights, expert_to_weights, expert_to_token_num, topk=topk_num)
+        expert_to_token_num = torch.zeros((E,), dtype=torch.int32, device="cuda")
+        moe_align_fused(
+            expert_to_token_index=expert_to_tokens,
+            expert_to_weight=expert_to_weights,
+            expert_token_num=expert_to_token_num,
+            topk_ids=curr_topk_ids,
+            topk_weights=curr_topk_weights,
+        )

         reused_mblock_infos = grouped_matmul(
             curr_topk_ids.numel(),
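
moe_align_fused collapses the earlier moe_align / moe_align1 pair into one pass: every (token, top-k) slot atomically reserves a position in its expert's bucket and stores the flat slot index and routing weight there, which is why the call site above now zero-initializes expert_to_token_num with torch.zeros before launch. A sequential CPU sketch of that bookkeeping, with illustrative names and tiny shapes rather than code from the repository, could look like this:

import torch

def moe_align_reference(topk_ids: torch.Tensor, topk_weights: torch.Tensor, expert_num: int):
    # topk_ids / topk_weights: [token_num, topk]; outputs mirror the kernel's buffers.
    token_num, topk = topk_ids.shape
    cap = token_num * topk
    expert_to_token_index = torch.zeros((expert_num, cap), dtype=torch.int64)
    expert_to_weight = torch.zeros((expert_num, cap), dtype=torch.float32)
    expert_token_num = torch.zeros((expert_num,), dtype=torch.int64)
    for flat, (e, w) in enumerate(zip(topk_ids.flatten().tolist(), topk_weights.flatten().tolist())):
        pos = int(expert_token_num[e])          # next free slot in this expert's bucket
        expert_to_token_index[e, pos] = flat    # flat slot index = token * topk + k
        expert_to_weight[e, pos] = w
        expert_token_num[e] += 1
    return expert_to_token_index, expert_to_weight, expert_token_num

topk_ids = torch.tensor([[0, 2], [1, 2], [0, 1]])
topk_weights = torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.5, 0.5]])
idx, w, cnt = moe_align_reference(topk_ids, topk_weights, expert_num=4)
print(cnt.tolist())  # [2, 2, 2, 0]

Because the GPU kernel hands out slots with tl.atomic_add across many program instances, the order of entries within an expert's bucket is not guaranteed to match this sequential reference; only the final counts and the set of (index, weight) pairs per expert are.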
