kvcache-ai
diff --git a/‎.github/ISSUE_TEMPLATE/3.performance_dicussion.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/ISSUE_TEMPLATE/3.performance_dicussion.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 92 additions & 13 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 92 additions & 13 deletions
diff --git a/‎.github/workflows/release.yaml‎
Lines changed: 97 additions & 1 deletion b/‎.github/workflows/release.yaml‎
Lines changed: 97 additions & 1 deletion
diff --git a/‎.typos.toml‎
Lines changed: 3 additions & 4 deletions b/‎.typos.toml‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 7 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎doc/en/build.md‎
Lines changed: 5 additions & 0 deletions b/‎doc/en/build.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎doc/en/ep-backend.md‎
Lines changed: 70 additions & 0 deletions b/‎doc/en/ep-backend.md‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎doc/en/error-code.md‎
Lines changed: 3 additions & 2 deletions b/‎doc/en/error-code.md‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎doc/en/heterogeneous_ascend.md‎
Lines changed: 3 additions & 0 deletions b/‎doc/en/heterogeneous_ascend.md‎
Lines changed: 3 additions & 0 deletions
@@ -1,4 +1,4 @@
-name: "⚙️ Preformance discussions"
+name: "⚙️ Performance discussions"
 description: "Questions about Mooncake's performance"
 title: "[Performance]: "
 labels: ["performance"]
 
@@ -91,14 +91,6 @@ jobs:
         python ./bootstrap_server.py &
       shell: bash
 
-    - name: Start Mooncake Master
-      run: |
-        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
-        # Set a small kv lease ttl to make the test faster.
-        # Must be consistent with the client test parameters.
-        mooncake_master --default_kv_lease_ttl=500 &
-      shell: bash
-
     - name: Test (in build env)
       run: |
         cd build
@@ -107,11 +99,6 @@ jobs:
         MC_METADATA_SERVER=http://127.0.0.1:8080/metadata DEFAULT_KV_LEASE_TTL=500 make test -j ARGS="-V"
       shell: bash
 
-    - name: Stop Mooncake Master Service
-      run: |
-        pkill mooncake_master || true
-      shell: bash
-
     - name: Generate Python version tag
       id: generate_tag_build
       run: |
@@ -307,6 +294,98 @@ jobs:
         name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}
         path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl
 
+  build-with-ep:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        python-version: ['3.10', '3.12']
+    env:
+      BUILD_WITH_EP: "1"
+      SCCACHE_GHA_ENABLED: "true"
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Free up disk space
+      run: |
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf /opt/hostedtoolcache/CodeQL
+
+    - name: Install CUDA Toolkit
+      uses: Jimver/[email protected]
+      with:
+        cuda: '12.8.1'
+        linux-local-args: '["--toolkit"]'
+        method: 'network'
+        sub-packages: '["nvcc", "nvrtc-dev"]'
+        non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'
+
+    - name: Run sccache-cache
+      uses: mozilla-actions/[email protected]
+
+    - name: Configure sccache
+      uses: actions/github-script@v7
+      with:
+        script: |
+          core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
+          core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+
+    - name: Run sccache stat for check
+      shell: bash
+      run: ${SCCACHE_PATH} --show-stats
+
+    - name: Install dependencies
+      run: |
+        sudo apt update -y
+        sudo bash -x dependencies.sh -y
+        pip install toml-cli  # for updating the version
+        pip install torch==2.8.0
+      shell: bash
+
+    - name: Build transfer engine with EP
+      # Configures, compiles and installs the project with EP support enabled.
+      # USE_CUDA must be passed exactly once: CMake keeps the last -D given for
+      # a variable, so a trailing -DUSE_CUDA=OFF would silently override the
+      # -DUSE_CUDA=ON that this job relies on.
+      # NOTE(review): the cuda/lib64/stubs paths presumably let the link step
+      # succeed on runners without an NVIDIA driver - confirm.
+      run: |
+        mkdir build
+        cd build
+        export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
+        export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
+        cmake .. -DUSE_ETCD=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DUSE_CUDA=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_EP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
+        make -j
+        sudo make install
+      shell: bash
+
+    - name: Build nvlink_allocator.so
+      run: |
+        mkdir -p build/mooncake-transfer-engine/nvlink-allocator
+        cd mooncake-transfer-engine/nvlink-allocator
+        bash build.sh --ci-build ../../build/mooncake-transfer-engine/nvlink-allocator/
+      shell: bash
+
+    - name: Generate Python version tag
+      id: generate_tag_flags
+      run: |
+        echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
+      shell: bash
+
+    - name: Build Python wheel
+      run: |
+        BASE_VERSION=$(toml get --toml-path mooncake-wheel/pyproject.toml project.version | tr -d '"')
+        toml set --toml-path mooncake-wheel/pyproject.toml project.version "${BASE_VERSION}+ep"
+        # Build wheel with specific Python version
+        PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }} ./scripts/build_wheel.sh
+      shell: bash
+
+    - name: Upload Python wheel artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}+ep
+        path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl
+
   build-docker:
     name: Build Docker Image
     runs-on: ubuntu-22.04
 
@@ -95,8 +95,104 @@ jobs:
           name: mooncake-wheel-py${{ steps.generate_tag_release.outputs.python_version_tag }}
           path: mooncake-wheel/dist-py${{ steps.generate_tag_release.outputs.python_version_tag }}/*.whl
 
+  build-with-ep:
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: write
+    strategy:
+      matrix:
+        python-version: ['3.10', '3.12']
+    env:
+      BUILD_WITH_EP: "1"
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Free up disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+
+      - name: Install CUDA Toolkit
+        uses: Jimver/[email protected]
+        with:
+          cuda: '12.8.1'
+          linux-local-args: '["--toolkit"]'
+          method: 'network'
+          sub-packages: '["nvcc", "nvrtc-dev"]'
+          non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'
+
+      - name: Run sccache-cache
+        uses: mozilla-actions/[email protected]
+
+      - name: Configure sccache
+        uses: actions/github-script@v7
+        with:
+          script: |
+            core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
+            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+
+      - name: Run sccache stat for check
+        shell: bash
+        run: ${SCCACHE_PATH} --show-stats
+
+      - name: Configure project
+        run: |
+          sudo apt update -y
+          sudo bash -x dependencies.sh -y
+          pip install toml-cli  # for updating the version
+          pip install torch==2.8.0
+          mkdir build
+          cd build
+          cmake .. -DUSE_HTTP=ON -DUSE_ETCD=ON -DUSE_CUDA=ON -DWITH_EP=ON -DSTORE_USE_ETCD=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Release
+        shell: bash
+
+      - name: Build project
+        run: |
+          cd build
+          make -j
+          sudo make install
+        shell: bash
+
+      - name: Build nvlink_allocator.so
+        run: |
+          mkdir -p build/mooncake-transfer-engine/nvlink-allocator
+          cd mooncake-transfer-engine/nvlink-allocator
+          bash build.sh --ci-build ../../build/mooncake-transfer-engine/nvlink-allocator/
+        shell: bash
+
+      - name: Generate Python version tag
+        id: generate_tag_release
+        run: |
+          echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
+        shell: bash
+
+      - name: Build Python wheel
+        # Appends the "+ep" local version suffix to the wheel version, then
+        # builds the wheel for the matrix Python version.
+        run: |
+          BASE_VERSION=$(toml get --toml-path mooncake-wheel/pyproject.toml project.version | tr -d '"')
+          toml set --toml-path mooncake-wheel/pyproject.toml project.version "${BASE_VERSION}+ep"
+          # Set LD_LIBRARY_PATH for wheel building
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
+          PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_release.outputs.python_version_tag }} ./scripts/build_wheel.sh
+        # Explicit bash runs with `-eo pipefail`, so a failing `toml get` in the
+        # pipeline above fails the step instead of producing an empty base
+        # version; it also matches the other steps of this job.
+        shell: bash
+        # NOTE(review): re-exports VERSION from the env context; presumably it
+        # is defined at workflow level - confirm.
+        env:
+          VERSION: ${{ env.VERSION }}
+
+      - name: Upload Python wheel artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: mooncake-wheel-py${{ steps.generate_tag_release.outputs.python_version_tag }}+ep
+          path: mooncake-wheel/dist-py${{ steps.generate_tag_release.outputs.python_version_tag }}/*.whl
+
   publish-release:
-    needs: build
+    needs:
+      - build
+      - build-with-ep
     runs-on: ubuntu-22.04
     permissions:
       contents: write
 
@@ -1,8 +1,7 @@
 [default]
-extend-ignore-words = ["CANN"]
-
-[files]
-extend-exclude = ["mooncake-ep/csrc/*.h"]
+extend-ignore-words = ["CANN", "ASO", "fre"]
 
 [default.extend-words]
 CANN = "CANN"
+ASO = "ASO"
+fre = "fre"
@@ -15,6 +15,7 @@ endif()
 option(WITH_STORE "build mooncake store library and sample code" ON)
 option(WITH_P2P_STORE "build p2p store library and sample code" OFF)
 option(WITH_RUST_EXAMPLE "build the Rust interface and sample code for the transfer engine" OFF)
+option(WITH_EP "build mooncake with expert parallelism support" OFF)
 
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extern/pybind11)
 set(PYTHON_EXECUTABLE "python3")
@@ -51,6 +52,12 @@ if (WITH_STORE)
   include_directories(mooncake-store/include)
 endif()
 
+if (WITH_EP)
+  message(STATUS "Mooncake EP will be built")
+  add_subdirectory(mooncake-ep)
+  include_directories(mooncake-ep/include)
+endif()
+
 add_subdirectory(mooncake-integration)
 
 if (WITH_P2P_STORE)
 
@@ -6,6 +6,11 @@ This document describes how to build Mooncake from source.
    ```bash
    pip3 install mooncake-transfer-engine --upgrade
    ```
+- To install with the Mooncake Backend and Mooncake EP support, use the following command:
+   ```bash
+   # the +ep wheel is built against torch 2.8.0; pick the release that matches your torch version
+   pip3 install mooncake-transfer-engine==0.3.7+ep --upgrade
+   ```
 
 ## Automatic
 
 
@@ -0,0 +1,70 @@
+# Mooncake EP & Mooncake Backend
+
+## Overview
+
+Mooncake EP is an adaptation of [DeepEP](https://github.com/deepseek-ai/DeepEP) that supports **fault tolerance** and fast data transfer with **IBGDA**, designed as a critical component for large-scale, latency-sensitive MoE (Mixture of Experts) inference. Mooncake EP aims to retain full compatibility with the DeepEP API, with the addition of an `active_ranks` tensor passed to both the `dispatch` and `combine` functions to capture information about rank activeness. By integrating with the EPLB module, Mooncake EP ensures fault tolerance during MoE inference, enabling robust performance even in large-scale, fault-prone environments.
+
+Mooncake Backend is a PyTorch distributed backend (a replacement for NCCL and Gloo) that provides **fault-tolerant collective communication primitives** and can be seamlessly integrated into machine learning systems. Built with the [Transfer Engine](transfer-engine.md), Mooncake Backend ensures that collective communications can continue even in the event of rank failures. Furthermore, it reports these failures to the upper layers of the system, allowing for graceful error handling without disrupting ongoing operations.
+
+## Usage
+
+### Mooncake EP
+
+> **Note:** Mooncake EP currently supports only the low-latency transfer mode.
+
+The API is largely consistent with DeepEP's, with only minor differences in a few parameters. Mooncake EP exposes a `Buffer` that can be imported from `mooncake.mooncake_ep_buffer`. For a usage example, see `mooncake-wheel/tests/test_mooncake_ep.py`.
+
+#### Buffer.get_ep_buffer_size_hint()
+
+**Signature:**
+
+```python
+@staticmethod
+def get_ep_buffer_size_hint(num_max_dispatch_tokens_per_rank: int, hidden: int, num_ranks: int, num_experts: int) -> int
+```
+
+Calculates the number of bytes to pre-allocate for data transfer.
+
+#### Buffer.\_\_init\_\_()
+
+**Signature:**
+
+```python
+def __init__(self, group: dist.ProcessGroup, num_ep_buffer_bytes: int = 0)
+```
+
+The constructor. Ensure that only one instance is created.
+
+- **group**: Must be a Mooncake Backend process group.
+- **num_ep_buffer_bytes**: The number of bytes obtained from `Buffer.get_ep_buffer_size_hint()`.
+
+#### Buffer.dispatch/Buffer.combine
+
+**Signature:** Similar to DeepEP's `low_latency_dispatch`/`low_latency_combine`, with two additional parameters:
+
+- **active_ranks**: A tensor of shape `(num_ranks,)` containing values of 0 or 1. The indices of the broken ranks will be set to 0.
+- **timeout_us**: The timeout in microseconds for a rank to be considered broken. Set to -1 for infinite timeout.
+
+### Mooncake Backend
+
+Basic usage:
+
+```python
+import torch
+import torch.distributed as dist
+from mooncake import ep
+
+active_ranks = torch.ones((world_size,), dtype=torch.int32, device="cuda")
+dist.init_process_group(
+    backend="mooncake",
+    rank=rank,
+    world_size=world_size,
+    pg_options=ep.MooncakeBackendOptions(active_ranks),
+)
+
+dist.all_gather(...)           # Standard API usage
+assert active_ranks.all()  # Verify that no ranks are broken
+```
+
+For a full example, see `mooncake-wheel/tests/test_mooncake_backend.py`.
+
@@ -35,18 +35,19 @@ Mooncake Store may generate various types of errors during execution. For most A
 | Segment Selection        | SHARD_INDEX_OUT_OF_RANGE (-100)| Shard index is out of bounds                                                                              |
 |                          | SEGMENT_NOT_FOUND (-101)       | No available segments found                                                                               |
 |                          | SEGMENT_ALREADY_EXISTS (-102)  | Segment already exists                                                                                    |
-| Handle Selection         | NO_AVAILABLE_HANDLE (-200)     | Memory allocation failed due to insufficient space.          |
+| Handle Selection         | NO_AVAILABLE_HANDLE (-200)     | Memory allocation failed due to insufficient space                                                        |
 | Version                  | INVALID_VERSION (-300)         | Invalid version                                                                                           |
 | Key                      | INVALID_KEY (-400)             | Invalid key                                                                                              |
 | Engine                   | WRITE_FAIL (-500)              | Write operation failed                                                                                    |
 | Parameter                | INVALID_PARAMS (-600)          | Invalid parameters                                                                                        |
 | Engine Operation         | INVALID_WRITE (-700)           | Invalid write operation                                                                                   |
 |                          | INVALID_READ (-701)            | Invalid read operation                                                                                    |
 |                          | INVALID_REPLICA (-702)         | Invalid replica operation                                                                                 |
-|                          | REPLICA_IS_NOT_READY (-703)    | Replica is not ready                                                                                      |
+| Object                   | REPLICA_IS_NOT_READY (-703)    | Replica is not ready                                                                                      |
 |                          | OBJECT_NOT_FOUND (-704)        | Object not found                                                                                          |
 |                          | OBJECT_ALREADY_EXISTS (-705)   | Object already exists                                                                                     |
 |                          | OBJECT_HAS_LEASE (-706)        | Object has lease                                                                                          |
+|                          | LEASE_EXPIRED (-707)           | Lease expired before data transfer completed                                                              |
 | Transfer                 | TRANSFER_FAIL (-800)           | Transfer operation failed                                                                                 |
 | RPC                      | RPC_FAIL (-900)                | RPC operation failed                                                                                      |
 | High Availability        | ETCD_OPERATION_ERROR (-1000)   | etcd operation failed                                                                                     |
 
@@ -9,6 +9,9 @@ Heterogeneous Ascend Transport is a high-performance data transmission library d
 
 > Current version only supports WRITE semantics. READ semantics will be implemented in future releases.
 
+## Enhanced HBM-to-DRAM Data Transfer Optimization
+The copy bandwidth from HBM to DRAM is constrained by the size of data blocks; blocks smaller than 2MB leave the bandwidth underutilized. To address this, we have implemented an optimization using "data aggregation + pipeline parallelism": small data blocks are first aggregated into 8MB blocks within HBM before being transferred to DRAM, and data copying and RDMA transmission are executed in parallel. This solution effectively hides the HBM-DRAM copy latency and significantly reduces the overall transmission time.
+
 ## Build Instructions
 The `USE_ASCEND_HETEROGENEOUS` compilation option has been added to `mooncake-common/common.cmake` to control this feature:
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-name: "⚙️ Preformance discussions"`
	`1`	`+name: "⚙️ Performance discussions"`
`2`	`2`	`description: "Questions about Mooncake's performance"`
`3`	`3`	`title: "[Performance]: "`
`4`	`4`	`labels: ["performance"]`