
Commit c0b2ba1 — "merge main 09243"
2 parents: 6997a18 + f0a0a27
File tree: 67 files changed, +4062 −329 lines


docker/Dockerfile.nixl

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG TARGETPLATFORM
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    libssl-dev \
    curl \
    g++ \
    make \
    git && \
    rm -rf /var/lib/apt/lists/*

RUN case ${TARGETPLATFORM} in \
    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
    *) MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

RUN case ${TARGETPLATFORM} in \
    "linux/arm64") exit 1 ;; \
    *) /opt/conda/bin/conda update -y conda && \
    /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
    esac && \
    /opt/conda/bin/conda clean -ya


WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124

RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .

RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms
RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev

ENV CUDA_HOME=/usr/local/cuda \
    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

RUN mkdir -p /tmp/gdrcopy && cd /tmp \
    && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
    && cd gdrcopy/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
    && cd / && rm -rf /tmp/gdrcopy

RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
    DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
    rm -rf /usr/lib/ucx && \
    rm -rf /opt/hpcx/ucx && \
    cd /usr/local/src && \
    git clone https://github.com/openucx/ucx.git && \
    cd ucx && \
    git checkout v1.19.x && \
    ./autogen.sh && ./configure \
    --enable-shared \
    --disable-static \
    --disable-doxygen-doc \
    --enable-optimizations \
    --enable-cma \
    --enable-devel-headers \
    --with-cuda=/usr/local/cuda \
    --with-verbs=yes \
    --with-dm \
    --with-gdrcopy=/usr/local \
    --with-efa \
    --enable-mt && \
    make -j && \
    make -j install-strip && \
    ldconfig;

RUN apt-get update && apt-get install -y pkg-config tmux net-tools ; \
    cd /usr/local/src; \
    pip install --upgrade meson pybind11 patchelf; \
    git clone https://github.com/ai-dynamo/nixl.git -b main && \
    cd nixl && \
    rm -rf build && \
    mkdir build && \
    meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \
    cd build && \
    ninja && \
    ninja install && \
    cd .. && pip install . --no-deps;

COPY . /lightllm
RUN pip install -e /lightllm --no-cache-dir

docker/Dockerfile.nixl.deepep

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
ARG CUDA_VERSION=12.6.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04

ARG PYTHON_VERSION=3.10
ARG MAMBA_VERSION=24.7.1-0
ARG TARGETPLATFORM

ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

RUN chmod 777 -R /tmp && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    libssl-dev \
    curl \
    g++ \
    make \
    git && \
    rm -rf /var/lib/apt/lists/*

RUN case ${TARGETPLATFORM} in \
    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
    *) MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh -v "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

RUN case ${TARGETPLATFORM} in \
    "linux/arm64") exit 1 ;; \
    *) /opt/conda/bin/conda update -y conda && \
    /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
    esac && \
    /opt/conda/bin/conda clean -ya


WORKDIR /root

COPY ./requirements.txt /lightllm/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip pip install -r /lightllm/requirements.txt --ignore-installed --extra-index-url https://download.pytorch.org/whl/cu124

RUN --mount=type=cache,target=/root/.cache/pip pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
RUN --mount=type=cache,target=/root/.cache/pip git clone https://github.com/ModelTC/LightKernel.git && cd LightKernel && pip install --no-deps -v .

RUN apt-get update && apt-get install -y libnuma-dev wget devscripts debhelper dh-make build-essential dkms
RUN apt-get install -y ibverbs-providers infiniband-diags perftest rdma-core libibverbs-dev librdmacm-dev

ENV CUDA_HOME=/usr/local/cuda \
    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

RUN mkdir -p /tmp/gdrcopy && cd /tmp \
    && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
    && cd gdrcopy/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
    && cd / && rm -rf /tmp/gdrcopy

# Fix DeepEP IBGDA symlink
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so

RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
    && tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz && mv nvshmem_src nvshmem \
    && cd nvshmem \
    && rm -f /root/nvshmem_src_cuda12-all-all-3.3.9.tar.gz \
    && NVSHMEM_SHMEM_SUPPORT=0 \
    NVSHMEM_UCX_SUPPORT=0 \
    NVSHMEM_USE_NCCL=0 \
    NVSHMEM_MPI_SUPPORT=0 \
    NVSHMEM_IBGDA_SUPPORT=1 \
    NVSHMEM_PMIX_SUPPORT=0 \
    NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
    NVSHMEM_USE_GDRCOPY=1 \
    cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 \
    && cmake --build build --target install -j64

ARG DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58
RUN git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..

WORKDIR /root/DeepEP
ENV NVSHMEM_DIR=/root/nvshmem/install
RUN NVSHMEM_DIR=/root/nvshmem/install python setup.py install

RUN apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
    DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
    rm -rf /usr/lib/ucx && \
    rm -rf /opt/hpcx/ucx && \
    cd /usr/local/src && \
    git clone https://github.com/openucx/ucx.git && \
    cd ucx && \
    git checkout v1.19.x && \
    ./autogen.sh && ./configure \
    --enable-shared \
    --disable-static \
    --disable-doxygen-doc \
    --enable-optimizations \
    --enable-cma \
    --enable-devel-headers \
    --with-cuda=/usr/local/cuda \
    --with-verbs=yes \
    --with-dm \
    --with-gdrcopy=/usr/local \
    --with-efa \
    --enable-mt && \
    make -j && \
    make -j install-strip && \
    ldconfig;

RUN apt-get update && apt-get install -y pkg-config tmux net-tools ; \
    cd /usr/local/src; \
    pip install --upgrade meson pybind11 patchelf; \
    git clone https://github.com/ai-dynamo/nixl.git -b main && \
    cd nixl && \
    rm -rf build && \
    mkdir build && \
    meson setup build/ --prefix=/usr/local/nixl --buildtype=release && \
    cd build && \
    ninja && \
    ninja install && \
    cd .. && pip install . --no-deps;

COPY . /lightllm
RUN pip install -e /lightllm --no-cache-dir

lightllm/common/deepseek2_mem_manager.py

Lines changed: 52 additions & 0 deletions
@@ -8,6 +8,7 @@
 from lightllm.common.kv_trans_kernel.kv_trans import kv_trans
 from lightllm.common.kv_trans_kernel.kv_trans_v2 import kv_trans_v2_for_d_node, kv_trans_v2_for_p_node
 from lightllm.distributed.pynccl import PyNcclCommunicator
+from lightllm.common.kv_trans_kernel.nixl_kv_trans import mla_page_io

 logger = init_logger(__name__)

@@ -36,6 +37,57 @@ def alloc_kv_move_buffer(self, max_req_total_len):
         self.token_dim_size = self.kv_move_buffer.shape[-1] * self.kv_move_buffer.shape[-2]
         return

+    def alloc_paged_kv_move_buffer(self, page_num, page_size) -> torch.Tensor:
+        self.kv_move_buffer = torch.empty(
+            (page_num, page_size, self.layer_num, self.head_num, self.head_dim), dtype=self.dtype, device="cuda"
+        )
+        self._buffer_mem_indexes_tensors = [
+            torch.empty((page_size,), dtype=torch.int64, device="cpu", pin_memory=True) for _ in range(page_num)
+        ]
+        return self.kv_move_buffer
+
+    def write_mem_to_page_kv_move_buffer(
+        self,
+        mem_indexes: List[int],
+        page_index: int,
+        dp_index: int,
+        mem_managers: List["MemoryManager"],
+        dp_world_size: int,
+    ):
+        cur_page = self.kv_move_buffer[page_index]
+        pin_mem_indexes = self._buffer_mem_indexes_tensors[page_index][0 : len(mem_indexes)]
+        pin_mem_indexes.numpy()[:] = mem_indexes
+        mem_indexes_gpu = pin_mem_indexes.cuda(non_blocking=True)
+        dp_mems = mem_managers[(dp_index * dp_world_size) : ((dp_index + 1) * dp_world_size)]
+        mla_page_io(
+            mem_indexes=mem_indexes_gpu,
+            page_tensor=cur_page,
+            kv_buffer=dp_mems[0].kv_buffer,
+            mode="write",
+        )
+        return
+
+    def read_page_kv_move_buffer_to_mem(
+        self,
+        mem_indexes: List[int],
+        page_index: int,
+        dp_index: int,
+        mem_managers: List["MemoryManager"],
+        dp_world_size: int,
+    ):
+        cur_page = self.kv_move_buffer[page_index]
+        pin_mem_indexes = self._buffer_mem_indexes_tensors[page_index][0 : len(mem_indexes)]
+        pin_mem_indexes.numpy()[:] = mem_indexes
+        mem_indexes_gpu = pin_mem_indexes.cuda(non_blocking=True)
+        dp_mems = mem_managers[(dp_index * dp_world_size) : ((dp_index + 1) * dp_world_size)]
+        for mem in dp_mems:
+            mla_page_io(
+                mem_indexes=mem_indexes_gpu,
+                page_tensor=cur_page,
+                kv_buffer=mem.kv_buffer,
+                mode="read",
+            )
+
     def send_to_decode_node(
         self,
         move_tasks: List[KVMoveTask],
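
For orientation, the staged copy that these two methods delegate to mla_page_io can be pictured as a gather on "write" and a scatter on "read" between the paged move buffer and a KV buffer. The sketch below is a plain-PyTorch reference of that assumed behavior only; the function name page_io_reference, the token-major kv_buffer layout, and the shapes are illustrative assumptions, not the actual Triton kernel in nixl_kv_trans.

import torch

def page_io_reference(mem_indexes: torch.Tensor, page_tensor: torch.Tensor,
                      kv_buffer: torch.Tensor, mode: str) -> None:
    # Assumed layouts, for illustration only:
    #   kv_buffer:   [total_tokens, layer_num, head_num, head_dim]
    #   page_tensor: [page_size,    layer_num, head_num, head_dim]
    n = mem_indexes.numel()
    if mode == "write":
        # gather the selected KV rows into the first n slots of the page
        page_tensor[:n] = kv_buffer[mem_indexes]
    elif mode == "read":
        # scatter the staged rows back into the destination KV buffer
        kv_buffer[mem_indexes] = page_tensor[:n]

# tiny illustrative shapes
kv = torch.randn(32, 2, 1, 8)
page = torch.zeros(4, 2, 1, 8)
idx = torch.tensor([3, 7, 21])
page_io_reference(idx, page, kv, mode="write")
dst = torch.zeros_like(kv)
page_io_reference(idx, page, dst, mode="read")
assert torch.equal(dst[idx], kv[idx])

In the diff itself, the write path gathers from a single manager of the DP group (dp_mems[0].kv_buffer) while the read path scatters into every manager of the group, and mem_indexes is staged through a per-page pinned CPU tensor so the .cuda(non_blocking=True) host-to-device copy does not block the host.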

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 93 additions & 3 deletions
@@ -219,6 +219,91 @@ def moe_align1(
     )


+@triton.jit
+def moe_align_fused_kernel(
+    topk_ids_ptr,  # [token_num, topk]
+    topk_weights_ptr,  # [token_num, topk]
+    expert_to_token_index_ptr,  # [expert_num, token_num * topk]
+    expert_to_weight_ptr,  # [expert_num, token_num * topk]
+    expert_token_num_ptr,  # [expert_num]
+    token_num,
+    topk_num: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    token_block = tl.program_id(0)
+    offs = token_block * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offs < token_num * topk_num
+
+    expert_ids = tl.load(topk_ids_ptr + offs, mask=mask, other=0)
+    weights = tl.load(topk_weights_ptr + offs, mask=mask, other=0.0)
+
+    # Use atomic_add to reserve each expert's next write position
+    write_pos = tl.atomic_add(expert_token_num_ptr + expert_ids, 1, mask=mask)
+
+    # Write the flat token index and its weight into the reserved slots
+    tl.store(
+        expert_to_token_index_ptr + expert_ids * (token_num * topk_num) + write_pos,
+        offs,
+        mask=mask,
+    )
+    tl.store(
+        expert_to_weight_ptr + expert_ids * (token_num * topk_num) + write_pos,
+        weights,
+        mask=mask,
+    )
+
+
+def _get_moe_align_fused_static_key(
+    topk_weights: torch.Tensor,
+) -> dict:
+    topk_num = topk_weights.shape[1]
+    return {
+        "topk_num": topk_num,
+    }
+
+
+def _get_moe_align_fused_configs():
+    return [
+        {
+            "BLOCK_SIZE": bt,
+            "num_warps": nw,
+        }
+        for nw in [1, 2, 4, 8]
+        for bt in [128, 256, 512, 1024, 2048]
+    ]
+
+
+@autotune(
+    kernel_name="moe_align_fused:v1",
+    configs_gen_func=_get_moe_align_fused_configs,
+    static_key_func=_get_moe_align_fused_static_key,
+    run_key_func=lambda topk_ids: topk_ids.shape[0],
+    mutates_args=["expert_to_token_index", "expert_to_weight", "expert_token_num"],
+)
+def moe_align_fused(
+    expert_to_token_index, expert_to_weight, expert_token_num, topk_ids, topk_weights, run_config: Optional[dict] = None
+):
+    token_num, topk_num = topk_ids.shape
+    if run_config is None:
+        run_config = {}
+    BLOCK_SIZE = run_config.get("BLOCK_SIZE", 256)
+    num_warps = run_config.get("num_warps", 4)
+
+    grid = (triton.cdiv(token_num * topk_num, BLOCK_SIZE),)
+    moe_align_fused_kernel[grid](
+        topk_ids,
+        topk_weights,
+        expert_to_token_index,
+        expert_to_weight,
+        expert_token_num,
+        token_num,
+        topk_num,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
+    )
+    return expert_to_token_index, expert_to_weight, expert_token_num
+
+
 @triton.jit
 def moe_align2_kernel(
     experts_token_num_ptr,  # [expert_num,]
@@ -719,9 +804,14 @@ def fused_experts_impl(

         expert_to_tokens = torch.empty((E, topk_num * tokens_in_chunk), dtype=torch.int32, device="cuda")
         expert_to_weights = torch.empty((E, topk_num * tokens_in_chunk), dtype=torch.float32, device="cuda")
-        moe_align(topk_ids=curr_topk_ids, out=expert_to_tokens)
-        expert_to_token_num = torch.empty((E,), dtype=torch.int32, device="cuda")
-        moe_align1(expert_to_tokens, curr_topk_weights, expert_to_weights, expert_to_token_num, topk=topk_num)
+        expert_to_token_num = torch.zeros((E,), dtype=torch.int32, device="cuda")
+        moe_align_fused(
+            expert_to_token_index=expert_to_tokens,
+            expert_to_weight=expert_to_weights,
+            expert_token_num=expert_to_token_num,
+            topk_ids=curr_topk_ids,
+            topk_weights=curr_topk_weights,
+        )

         reused_mblock_infos = grouped_matmul(
             curr_topk_ids.numel(),
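
moe_align_fused collapses the earlier moe_align / moe_align1 pair into one pass: every (token, top-k) slot atomically reserves a position in its expert's bucket and stores the flat slot index and routing weight there, which is why the call site above now zero-initializes expert_to_token_num with torch.zeros before launch. A sequential CPU sketch of that bookkeeping, with illustrative names and tiny shapes rather than code from the repository, could look like this:

import torch

def moe_align_reference(topk_ids: torch.Tensor, topk_weights: torch.Tensor, expert_num: int):
    # topk_ids / topk_weights: [token_num, topk]; outputs mirror the kernel's buffers.
    token_num, topk = topk_ids.shape
    cap = token_num * topk
    expert_to_token_index = torch.zeros((expert_num, cap), dtype=torch.int64)
    expert_to_weight = torch.zeros((expert_num, cap), dtype=torch.float32)
    expert_token_num = torch.zeros((expert_num,), dtype=torch.int64)
    for flat, (e, w) in enumerate(zip(topk_ids.flatten().tolist(), topk_weights.flatten().tolist())):
        pos = int(expert_token_num[e])          # next free slot in this expert's bucket
        expert_to_token_index[e, pos] = flat    # flat slot index = token * topk + k
        expert_to_weight[e, pos] = w
        expert_token_num[e] += 1
    return expert_to_token_index, expert_to_weight, expert_token_num

topk_ids = torch.tensor([[0, 2], [1, 2], [0, 1]])
topk_weights = torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.5, 0.5]])
idx, w, cnt = moe_align_reference(topk_ids, topk_weights, expert_num=4)
print(cnt.tolist())  # [2, 2, 2, 0]

Because the GPU kernel hands out slots with tl.atomic_add across many program instances, the order of entries within an expert's bucket is not guaranteed to match this sequential reference; only the final counts and the set of (index, weight) pairs per expert are.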
