PaddlePaddle
diff --git a/‎.github/workflows/_base_test.yml
Lines changed: 166 additions & 0 deletions b/‎.github/workflows/_base_test.yml
Lines changed: 166 additions & 0 deletions
diff --git a/‎.github/workflows/_build_linux.yml
Lines changed: 1 addition & 3 deletions b/‎.github/workflows/_build_linux.yml
Lines changed: 1 addition & 3 deletions
diff --git a/‎.github/workflows/_clone_linux.yml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/_clone_linux.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/_logprob_test_linux.yml
Lines changed: 22 additions & 9 deletions b/‎.github/workflows/_logprob_test_linux.yml
Lines changed: 22 additions & 9 deletions
diff --git a/‎.github/workflows/_unit_test_coverage.yml
Lines changed: 2 additions & 3 deletions b/‎.github/workflows/_unit_test_coverage.yml
Lines changed: 2 additions & 3 deletions
diff --git a/‎.github/workflows/ci_gcu.yml
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/ci_gcu.yml
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/ci_iluvatar.yml
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/ci_iluvatar.yml
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/pr_build_and_test.yml
Lines changed: 13 additions & 3 deletions b/‎.github/workflows/pr_build_and_test.yml
Lines changed: 13 additions & 3 deletions
diff --git a/‎.gitignore
Lines changed: 3 additions & 0 deletions b/‎.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/‎custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh
Lines changed: 14 additions & 15 deletions b/‎custom_ops/gpu_ops/append_attn/append_attention_c16_impl.cuh
Lines changed: 14 additions & 15 deletions
@@ -0,0 +1,166 @@
+name: Base Test
+description: "Run Base Tests"
+
+# Reusable workflow: it only runs when another workflow invokes it through
+# `uses:` and supplies the inputs declared below.
+on:
+  workflow_call:
+    inputs:
+      DOCKER_IMAGE:
+        description: "Build Images"
+        # Optional, so that the default below can actually take effect; a
+        # default is never consulted for an input the caller must supply.
+        required: false
+        type: string
+        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
+      FASTDEPLOY_ARCHIVE_URL:
+        description: "URL of the compressed FastDeploy code archive."
+        required: true
+        type: string
+      FASTDEPLOY_WHEEL_URL:
+        description: "URL of the FastDeploy Wheel."
+        required: true
+        type: string
+      CACHE_DIR:
+        # When empty, the test step falls back to the directory two levels
+        # above the GitHub workspace.
+        description: "Cache Dir Use"
+        required: false
+        type: string
+        default: ""
+      MODEL_CACHE_DIR:
+        # Mounted into the test container at /MODELDATA. The test step exits
+        # with an error when this directory does not exist on the runner, so
+        # callers should always pass it even though it is declared optional.
+        description: "Host directory holding the cached models; must exist on the runner"
+        required: false
+        type: string
+        default: ""
+
+jobs:
+  base_tests:
+    runs-on: [self-hosted, GPU-h20-1Cards]
+    steps:
+      # Removes any checkout left in the runner workspace, then downloads and
+      # unpacks the FastDeploy source archive supplied by the caller.
+      - name: Code Prepare
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
+        run: |
+            set -x
+            FULL_REPO="${{ github.repository }}"
+            # Strip the "owner/" prefix to get the bare repository name.
+            REPO_NAME="${FULL_REPO##*/}"
+
+            # Clean the repository directory before starting. The removal runs
+            # inside the container (presumably because earlier runs leave files
+            # owned by the container user - TODO confirm). Quoting keeps an
+            # empty or unusual REPO_NAME from turning the test or the rm into
+            # something broader than intended.
+            docker run --rm --net=host -v "$(pwd):/workspace" -w /workspace \
+            -e "REPO_NAME=${REPO_NAME}" \
+            "${docker_image}" /bin/bash -c '
+              if [ -d "${REPO_NAME}" ]; then
+                echo "Directory ${REPO_NAME} exists, removing it..."
+                rm -rf "${REPO_NAME}"*
+              fi
+            '
+
+            # -O pins the output name: without it, a stale FastDeploy.tar.gz
+            # makes wget save the fresh download as FastDeploy.tar.gz.1 and
+            # tar would then unpack the old archive.
+            wget -q -O FastDeploy.tar.gz "${fd_archive_url}"
+            tar -xf FastDeploy.tar.gz
+            rm -rf FastDeploy.tar.gz
+            cd FastDeploy
+            git config --global user.name "FastDeployCI"
+            git config --global user.email "[email protected]"
+            git log -n 3 --oneline
+
+      # Installs the wheel inside the test image, starts the service through
+      # test/ce/deploy/deploy.py, runs the pytest suites in test/ce/server
+      # against it, and fails the step when any suite fails.
+      - name: Run FastDeploy Base Tests
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
+          CACHE_DIR: ${{ inputs.CACHE_DIR }}
+          MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
+        run: |
+            runner_name="${{ runner.name }}"
+            last_char="${runner_name: -1}"
+
+            # A trailing digit 0-7 in the runner name selects the GPU index;
+            # any other name falls back to GPU 0.
+            if [[ "$last_char" =~ [0-7] ]]; then
+              DEVICES="$last_char"
+            else
+              DEVICES="0"
+            fi
+
+            # Offset every port by 100 per GPU index so that runners using
+            # different indices get distinct ports.
+            FLASK_PORT=$((42068 + DEVICES * 100))
+            FD_API_PORT=$((42088 + DEVICES * 100))
+            FD_ENGINE_QUEUE_PORT=$((42058 + DEVICES * 100))
+            FD_METRICS_PORT=$((42078 + DEVICES * 100))
+            echo "Test ENV Parameter:"
+            echo "========================================================="
+            echo "FLASK_PORT=${FLASK_PORT}"
+            echo "FD_API_PORT=${FD_API_PORT}"
+            echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
+            echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
+            echo "DEVICES=${DEVICES}"
+            echo "========================================================="
+
+            # Default the cache location to two levels above the workspace.
+            CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
+            echo "CACHE_DIR is set to ${CACHE_DIR}"
+            # The file must exist before it is bind-mounted below; otherwise
+            # docker would create a directory at that path.
+            if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
+              touch "${CACHE_DIR}/gitconfig"
+            fi
+            if [ ! -d "${MODEL_CACHE_DIR}" ]; then
+              echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
+              exit 1
+            fi
+
+            # The container script is one single-quoted string: variables in
+            # it are expanded by the shell inside the container (from the -e
+            # values below), and it must never contain a single quote.
+            docker run --rm --ipc=host --pid=host --net=host \
+            -v "$(pwd):/workspace" \
+            -w /workspace \
+            -e "fastdeploy_wheel_url=${fastdeploy_wheel_url}" \
+            -e "FD_API_PORT=${FD_API_PORT}" \
+            -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
+            -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
+            -e "FLASK_PORT=${FLASK_PORT}" \
+            -v "${MODEL_CACHE_DIR}:/MODELDATA" \
+            -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
+            -v "${CACHE_DIR}/.cache:/root/.cache" \
+            -v "${CACHE_DIR}/ConfigDir:/root/.config" \
+            -e TZ="Asia/Shanghai" \
+            --gpus '"device='"${DEVICES}"'"' "${docker_image}" /bin/bash -xc '
+            # python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+            python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+
+            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+            python -m pip install "${fastdeploy_wheel_url}"
+            python -m pip install pytest
+
+            wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
+            chmod +x ./llm-deploy-linux-amd64
+            ./llm-deploy-linux-amd64 -python python3.10 \
+            -model_name ERNIE-4.5-0.3B-Paddle \
+            -model_path /MODELDATA \
+            --skip install
+
+            git config --global --add safe.directory /workspace/FastDeploy
+            cd FastDeploy
+            pushd test/ce/deploy
+            python3.10 deploy.py > dd.log 2>&1 &
+            sleep 3
+            curl -X POST "http://0.0.0.0:${FLASK_PORT}/start" \
+              -H "Content-Type: application/json" \
+              -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
+
+            curl -X POST "http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90"
+            popd
+
+            pushd test/ce/server
+            export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
+            export TEMPLATE=TOKEN_LOGPROB
+            TEST_EXIT_CODE=0
+            python -m pytest -sv test_base_chat.py test_compare_top_logprobs.py test_logprobs.py test_params_boundary.py test_seed_usage.py test_stream.py test_evil_cases.py || TEST_EXIT_CODE=1
+            curl -X POST "http://0.0.0.0:${FLASK_PORT}/switch" \
+              -H "Content-Type: application/json" \
+              -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\", \"--early-stop-config\": \"{\\\"enable_early_stop\\\":true, \\\"window_size\\\":6, \\\"threshold\\\":0.93}\"}"
+            curl -X POST "http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90"
+            python -m pytest -sv test_repetition_early_stop.py || TEST_EXIT_CODE=1
+            popd
+            echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
+            '
+            if [ -f ./FastDeploy/exit_code.env ]; then
+              source ./FastDeploy/exit_code.env
+              cat ./FastDeploy/exit_code.env >> "$GITHUB_ENV"
+            fi
+            # Treat a missing result as a failure: with TEST_EXIT_CODE unset,
+            # a bare exit would return the status of the echo before it,
+            # which is success.
+            TEST_EXIT_CODE="${TEST_EXIT_CODE:-1}"
+            echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
+            exit "${TEST_EXIT_CODE}"
@@ -125,9 +125,7 @@ jobs:
               export FASTDEPLOY_VERSION="${FASTDEPLOY_VERSION}.dev${DATE_ONLY}"
             fi
             python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
-            pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
-            pip config set install.trusted-host  pip.baidu.com
-            pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 
             python -m pip install --upgrade pip
             python -m pip install -r requirements.txt
 
@@ -68,7 +68,7 @@ jobs:
             branch_name=${{ github.ref_name }}
             target_path=paddle-github-action/BRANCH/FastDeploy/${branch_name}/${commit_id}
           fi
-          wget  -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
+          wget -O bos_tools.py -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
           push_file=$(realpath bos_tools.py)
           python -m pip install bce-python-sdk==0.9.29
           ls
 
@@ -70,10 +70,18 @@ jobs:
             DEVICES="0"
           fi
 
-          FLASK_PORT=$((9160 + DEVICES * 100))
-          FD_API_PORT=$((9180 + DEVICES * 100))
-          FD_ENGINE_QUEUE_PORT=$((9150 + DEVICES * 100))
-          FD_METRICS_PORT=$((9170 + DEVICES * 100))
+          FLASK_PORT=$((42068 + DEVICES * 100))
+          FD_API_PORT=$((42088 + DEVICES * 100))
+          FD_ENGINE_QUEUE_PORT=$((42058 + DEVICES * 100))
+          FD_METRICS_PORT=$((42078 + DEVICES * 100))
+          echo "Test ENV Parameter:"
+          echo "========================================================="
+          echo "FLASK_PORT=${FLASK_PORT}"
+          echo "FD_API_PORT=${FD_API_PORT}"
+          echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
+          echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
+          echo "DEVICES=${DEVICES}"
+          echo "========================================================="
 
           CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
           echo "CACHE_DIR is set to ${CACHE_DIR}"
@@ -86,8 +94,10 @@ jobs:
           fi
 
           PARENT_DIR=$(dirname "$WORKSPACE")
+          unset http_proxy
+          unset https_proxy
 
-          docker run --ipc=host --pid=host --net=host \
+          docker run --rm --ipc=host --pid=host --net=host \
           -v $(pwd):/workspace \
           -w /workspace \
           -e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
@@ -100,13 +110,12 @@ jobs:
           -v "${CACHE_DIR}/.cache:/root/.cache" \
           -v "${CACHE_DIR}/ConfigDir:/root/.config" \
           -e TZ="Asia/Shanghai" \
-          --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c '
+          --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
           # python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
           python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
 
-          pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
-          pip config set install.trusted-host  pip.baidu.com
-          pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
           python -m pip install ${fastdeploy_wheel_url}
 
           wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
@@ -124,6 +133,10 @@ jobs:
               -d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
 
           curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
+          curl -s -o /dev/null -w "%{http_code}" -m 2 "http://0.0.0.0:${FD_API_PORT}/health"
+          curl -X POST "http://0.0.0.0:${FD_API_PORT}/v1/chat/completions" \
+            -H "Content-Type: application/json" \
+            -d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}"
           set +e
           rm -rf ./baseline_output
           cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output
 
@@ -96,9 +96,8 @@ jobs:
             # python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
             python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
 
-            pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
-            pip config set install.trusted-host  pip.baidu.com
-            pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+            pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
 
             python -m pip install coverage
             python -m pip install diff-cover
 
@@ -13,7 +13,8 @@ concurrency:
 
 jobs:
   CI_GCU:
-    runs-on: [self-hosted, GCU-S60-8Card]
+    runs-on:
+      group: GCU
     steps:
       - name: Print current runner name
         run: |
 
@@ -11,7 +11,8 @@ concurrency:
 
 jobs:
   CI_ILUVATAR:
-    runs-on: [self-hosted, IXUCA]
+    runs-on:
+      group: IXUCA
     steps:
       - name: Print current runner name
         run: |
 
@@ -19,7 +19,7 @@ jobs:
     needs: clone
     uses: ./.github/workflows/_build_linux.yml
     with:
-      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
       FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
       COMPILE_ARCH: "89,90"
       WITH_NIGHTLY_BUILD: "OFF"
@@ -39,7 +39,7 @@ jobs:
     needs: [clone,build]
     uses: ./.github/workflows/_unit_test_coverage.yml
     with:
-      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
       FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
       FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
 
@@ -48,7 +48,7 @@ jobs:
     needs: [build]
     uses: ./.github/workflows/_logprob_test_linux.yml
     with:
-      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
       PADDLETEST_ARCHIVE_URL: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz"
       FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
       MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelCache"
@@ -61,3 +61,13 @@ jobs:
       DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126
       FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
       FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
+
+  # Calls the reusable base-test workflow once the clone and build jobs have
+  # finished, passing on the source archive URL and wheel path they output.
+  base_test:
+    name: Run Base Tests
+    needs:
+      - clone
+      - build
+    uses: ./.github/workflows/_base_test.yml
+    with:
+      DOCKER_IMAGE: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate"
+      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
+      FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
+      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelCache"
@@ -167,3 +167,6 @@ build
 .ccls-cache
 
 third_party
+
+custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm_*.cu
+custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm_template.h
@@ -1061,12 +1061,11 @@ void MultiQueryAppendAttention(
     if (!is_decoder) {
       chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
     }
-    const int num_chunks = div_up(max_dec_len, chunk_size);
 
+    const int num_chunks = div_up(max_seq_len, chunk_size);
     dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
     dim3 blocks(32, num_warps);
-
-    if (num_chunks <= 1) {
+    if (num_chunks <= 0) {
       auto nosplit_kv_kernel =
           multi_query_append_attention_warp1_4_kernel<NV_TYPE,
                                                       false,
@@ -1161,8 +1160,8 @@ void MultiQueryAppendAttention(
           reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k.data<T>())),
           reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v.data<T>())),
           shift_bias ? reinterpret_cast<NV_TYPE *>(
-                           const_cast<T *>(shift_bias.get().data<T>()))
-                     : nullptr,
+                            const_cast<T *>(shift_bias.get().data<T>()))
+                      : nullptr,
           smooth_weight ? reinterpret_cast<NV_TYPE *>(
                               const_cast<T *>(smooth_weight.get().data<T>()))
                         : nullptr,
@@ -1208,8 +1207,8 @@ void MultiQueryAppendAttention(
                 seq_lens_encoder.data<int>(),
                 cu_seqlens_q.data<int>(),
                 shift_bias ? reinterpret_cast<NV_TYPE *>(
-                                 const_cast<T *>(shift_bias.get().data<T>()))
-                           : nullptr,
+                                  const_cast<T *>(shift_bias.get().data<T>()))
+                            : nullptr,
                 smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
                                     smooth_weight.get().data<T>()))
                               : nullptr,
@@ -1226,14 +1225,14 @@ void MultiQueryAppendAttention(
         constexpr int blockx = HEAD_DIM / vec_size;
         constexpr int blocky = (128 + blockx - 1) / blockx;
         dim3 grids_merge(min(sm_count * 4, token_num),
-                         num_heads);
+                          num_heads);
         dim3 blocks_merge(blockx, blocky);
         merge_multi_chunks_v2_kernel<NV_TYPE,
-                                     vec_size,
-                                     blocky,
-                                     HEAD_DIM,
-                                     OUT_NV_TYPE,
-                                     ENABLE_PREFILL>
+                                      vec_size,
+                                      blocky,
+                                      HEAD_DIM,
+                                      OUT_NV_TYPE,
+                                      ENABLE_PREFILL>
             <<<grids_merge, blocks_merge, 0, stream>>>(
                 reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
                 static_cast<float *>(tmp_m->ptr()),
@@ -1244,8 +1243,8 @@ void MultiQueryAppendAttention(
                 batch_id_per_token.data<int>(),
                 cu_seqlens_q.data<int>(),
                 shift_bias ? reinterpret_cast<NV_TYPE *>(
-                                 const_cast<T *>(shift_bias.get().data<T>()))
-                           : nullptr,
+                                  const_cast<T *>(shift_bias.get().data<T>()))
+                            : nullptr,
                 smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
                                     smooth_weight.get().data<T>()))
                               : nullptr,