diff --git a/.github/workflows/_build_xpu.yml b/.github/workflows/_build_xpu.yml
new file mode 100644
index 00000000000..af6d63e70e2
--- /dev/null
+++ b/.github/workflows/_build_xpu.yml
@@ -0,0 +1,195 @@
+name: XPU-Build-Test
+
+on:
+  workflow_call:
+    inputs:
+      DOCKER_IMAGE:
+        description: "Build Images"
+        required: true
+        type: string
+        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci"
+      FASTDEPLOY_ARCHIVE_URL:
+        description: "URL of the compressed FastDeploy code archive."
+        required: true
+        type: string
+      WITH_NIGHTLY_BUILD:
+        description: "Enable nightly build mode (e.g. add date suffix to version)"
+        required: false
+        type: string
+        default: "OFF"
+      FD_VERSION:
+        description: "FastDeploy Package Version"
+        required: false
+        type: string
+        default: ""
+      PADDLEVERSION:
+        description: "Paddle Version Build Use"
+        required: false
+        type: string
+        default: ""
+      PADDLE_WHL_URL:
+        description: "Paddle Wheel Package URL"
+        required: false
+        type: string
+        default: ""
+    outputs:
+      wheel_path:
+        description: "Output path of the generated wheel"
+        value: ${{ jobs.xpu-build-test.outputs.wheel_path }}
+jobs:
+  xpu-build-test:
+    runs-on: [self-hosted, XPU-P800-8Card]
+    outputs:
+      wheel_path: ${{ steps.set_output.outputs.wheel_path }}
+    steps:
+      - name: Code Prepare
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
+          IS_PR: ${{ github.event_name == 'pull_request' }}
+        run: |
+          set -x
+          REPO="https://github.com/${{ github.repository }}.git"
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+          BASE_BRANCH="${{ github.base_ref }}"
+          docker pull ${docker_image}
+          # Clean the repository directory before starting
+          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
+            -e "REPO_NAME=${REPO_NAME}" \
+            ${docker_image} /bin/bash -c '
+            CLEAN_RETRIES=3
+            CLEAN_COUNT=0
+
+            while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do
+              echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..."
+              rm -rf "${REPO_NAME}"* || true
+              sleep 2
+
+              # Check if anything matching ${REPO_NAME}* still exists
+              if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then
+                echo "All ${REPO_NAME}* removed successfully"
+                break
+              fi
+
+              CLEAN_COUNT=$((CLEAN_COUNT + 1))
+            done
+
+            if ls "${REPO_NAME}"* >/dev/null 2>&1; then
+              echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
+              ls -ld "${REPO_NAME}"*
+              exit 1
+            fi
+            '
+
+          wget -q --no-proxy ${fd_archive_url}
+          tar -xf FastDeploy.tar.gz
+          rm -rf FastDeploy.tar.gz
+          cd FastDeploy
+          git config --global user.name "FastDeployCI"
+          git config --global user.email "fastdeploy_ci@example.com"
+          git log -n 3 --oneline
+      - name: FastDeploy Build
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fd_version: ${{ inputs.FD_VERSION }}
+          BRANCH_REF: ${{ github.ref_name }}
+          PADDLEVERSION: ${{ inputs.PADDLEVERSION }}
+          PADDLE_WHL_URL: ${{ inputs.PADDLE_WHL_URL }}
+          WITH_NIGHTLY_BUILD: ${{ inputs.WITH_NIGHTLY_BUILD }}
+        run: |
+          set -x
+          runner_name="${{ runner.name }}"
+          CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
+          gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
+
+          PARENT_DIR=$(dirname "$WORKSPACE")
+          echo "PARENT_DIR:$PARENT_DIR"
+          docker run --rm --net=host \
+            --cap-add=SYS_PTRACE --privileged --shm-size=64G \
+            -v $(pwd):/workspace -w /workspace \
+            -v "/ssd3:/ssd3" \
+            -e "MODEL_PATH=/ssd3/model" \
+            -e "http_proxy=$(git config --global --get http.proxy)" \
+            -e "https_proxy=$(git config --global --get https.proxy)" \
+            -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,127.0.0.1,localhost" \
+            -e TZ="Asia/Shanghai" \
+            -e "FD_VERSION=${fd_version}" \
+            -e "WITH_NIGHTLY_BUILD=${WITH_NIGHTLY_BUILD}" \
+            -e "PADDLEVERSION=${PADDLEVERSION}" \
+            -e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
+            -e "BRANCH_REF=${BRANCH_REF}" \
+            ${docker_image} /bin/bash -c '
+            if [[ -n "${FD_VERSION}" ]]; then
+              export FASTDEPLOY_VERSION=${FD_VERSION}
+              echo "Custom FastDeploy version: ${FASTDEPLOY_VERSION}"
+            fi
+
+            git config --global --add safe.directory /workspace/FastDeploy
+            chown -R $(whoami) /workspace/FastDeploy
+            cd FastDeploy
+            if [[ "${WITH_NIGHTLY_BUILD}" == "ON" ]];then
+              GIT_COMMIT_TIME=$(git --no-pager show -s --format=%ci HEAD)
+              DATE_ONLY=$(echo $GIT_COMMIT_TIME | sed "s/ .*//;s/-//g")
+              echo "Git Commit Time: $GIT_COMMIT_TIME"
+              echo "Date Only: $DATE_ONLY"
+              export FASTDEPLOY_VERSION="${FASTDEPLOY_VERSION}.dev${DATE_ONLY}"
+            fi
+            python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+            # Use a different PaddlePaddle package depending on the branch or tag
+            if [[ "${PADDLE_WHL_URL}" != "" ]];then
+              python -m pip install ${PADDLE_WHL_URL}
+            elif [[ "${PADDLEVERSION}" != "" ]];then
+              python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y
+              python -m pip install paddlepaddle-xpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+            else
+              python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y
+              # python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
+              python -m pip install https://paddle-whl.bj.bcebos.com/nightly/xpu-p800/paddlepaddle-xpu/paddlepaddle_xpu-3.4.0.dev20260107-cp310-cp310-linux_x86_64.whl
+            fi
+
+
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            bash custom_ops/xpu_ops/download_dependencies.sh develop
+            export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
+            export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
+            bash build.sh
+            ls ./dist/*.whl
+            '
+      - name: Package Upload
+        id: set_output
+        run: |
+          set -x
+          if [[ "${{ github.event_name }}" == "pull_request" ]];then
+            commit_id=${{ github.event.pull_request.head.sha }}
+            pr_num=${{ github.event.pull_request.number }}
+            target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}/xpu
+          elif [[ "${{ github.ref_type }}" == "tag" ]]; then
+            commit_id=${{ github.sha }}
+            tag_name=${{ github.ref_name }}
+            target_path=paddle-github-action/TAG/FastDeploy/${tag_name}/${commit_id}/xpu
+          else
+            commit_id=${{ github.sha }}
+            branch_name=${{ github.ref_name }}
+            target_path=paddle-github-action/BRANCH/FastDeploy/${branch_name}/${commit_id}/xpu
+          fi
+          wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
+          push_file=$(realpath bos_tools.py)
+          python3 --version
+          python3 -m pip install bce-python-sdk==0.9.29
+          cd FastDeploy/dist/
+          matches=($(ls fastdeploy*.whl))
+          if [ ${#matches[@]} -ne 1 ]; then
+            echo "Error: Found ${#matches[@]} matching files, expected exactly 1"
+            exit 1
+          fi
+          fd_wheel_name=${matches[0]}
+          echo "Found: $fd_wheel_name"
+          # tree -L 3
+          python3 ${push_file} fastdeploy*.whl ${target_path}
+          target_path_stripped="${target_path#paddle-github-action/}"
+          WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name}
+          echo "wheel_path=${WHEEL_PATH}" >> $GITHUB_OUTPUT
\ No newline at end of file
diff --git a/.github/workflows/_xpu_4cards_case_test.yml b/.github/workflows/_xpu_4cards_case_test.yml
new file mode 100644
index 00000000000..262a56bcd40
--- /dev/null
+++ b/.github/workflows/_xpu_4cards_case_test.yml
@@ -0,0 +1,195 @@
+name: xpu_4cards_case_test
+
+on:
+  workflow_call:
+    inputs:
+      DOCKER_IMAGE:
+        description: "Build Images"
+        required: true
+        type: string
+        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci"
+      FASTDEPLOY_ARCHIVE_URL:
+        description: "URL of the compressed FastDeploy code archive."
+        required: true
+        type: string
+      FASTDEPLOY_WHEEL_URL:
+        description: "URL of the compressed FastDeploy whl."
+        required: true
+        type: string
+      FD_VERSION:
+        description: "FastDeploy Package Version"
+        required: false
+        type: string
+        default: ""
+      PADDLEVERSION:
+        description: "Paddle Version Build Use"
+        required: false
+        type: string
+        default: ""
+      PADDLE_WHL_URL:
+        description: "Paddle Wheel Package URL"
+        required: false
+        type: string
+        default: ""
+      MODEL_PATH:
+        description: "MODEL Dir Use"
+        required: true
+        type: string
+        default: ""
+
+jobs:
+  run_xpu_4cards_cases:
+    runs-on: [self-hosted, XPU-P800-4Cards]
+    timeout-minutes: 60
+    steps:
+      - name: Print current runner name
+        run: |
+          echo "Current runner name: ${{ runner.name }}"
+      - name: Code Prepare
+        shell: bash
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
+          fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
+          model_path: ${{ inputs.MODEL_PATH }}
+        run: |
+          set -x
+          REPO="https://github.com/${{ github.repository }}.git"
+          FULL_REPO="${{ github.repository }}"
+          REPO_NAME="${FULL_REPO##*/}"
+          BASE_BRANCH="${{ github.base_ref }}"
+          docker pull ${docker_image}
+          # Clean the repository directory before starting
+          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
+            -e "REPO_NAME=${REPO_NAME}" \
+            ${docker_image} /bin/bash -c '
+            CLEAN_RETRIES=3
+            CLEAN_COUNT=0
+
+            while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do
+              echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..."
+              rm -rf "${REPO_NAME}"* || true
+              sleep 2
+
+              # Check if anything matching ${REPO_NAME}* still exists
+              if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then
+                echo "All ${REPO_NAME}* removed successfully"
+                break
+              fi
+
+              CLEAN_COUNT=$((CLEAN_COUNT + 1))
+            done
+
+            if ls "${REPO_NAME}"* >/dev/null 2>&1; then
+              echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
+              ls -ld "${REPO_NAME}"*
+              exit 1
+            fi
+            '
+
+          wget -q --no-proxy ${fd_archive_url}
+          tar -xf FastDeploy.tar.gz
+          rm -rf FastDeploy.tar.gz
+          cd FastDeploy
+          git config --global user.name "FastDeployCI"
+          git config --global user.email "fastdeploy_ci@example.com"
+          git log -n 3 --oneline
+
+      - name: Run CI unittest
+        env:
+          docker_image: ${{ inputs.DOCKER_IMAGE }}
+          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
+          fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
+          model_path: ${{ inputs.MODEL_PATH }}
+        run: |
+          runner_name="${{ runner.name }}"
+          last_char="${runner_name: -1}"
+
+          if [[ "$last_char" == "1" ]]; then
+            xpu_id="4"
+          else
+            xpu_id="0"
+          fi
+          PARENT_DIR=$(dirname "$WORKSPACE")
+          echo "PARENT_DIR:$PARENT_DIR"
+          docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \
+            -v $(pwd):/workspace -w /workspace \
+            -v "/ssd3:/ssd3" \
+            -e "MODEL_PATH=${model_path}" \
+            -e "FASTDEPLOY_ARCHIVE_URL=${fd_archive_url}" \
+            -e "FASTDEPLOY_WHEEL_URL=${fd_wheel_url}" \
+            -e "PADDLEVERSION=${PADDLEVERSION}" \
+            -e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
+            -e "http_proxy=$(git config --global --get http.proxy)" \
+            -e "https_proxy=$(git config --global --get https.proxy)" \
+            -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,127.0.0.1,localhost" \
+            -e "XPU_ID=${xpu_id}" \
+            ${docker_image} /bin/bash -c '
+            echo "Installing lsof..."
+            apt install -y lsof
+
+            # Set XPU_VISIBLE_DEVICES
+            if [[ "$XPU_ID" == "0" ]]; then
+              export XPU_VISIBLE_DEVICES="0,1,2,3"
+            else
+              export XPU_VISIBLE_DEVICES="4,5,6,7"
+            fi
+            echo "XPU_VISIBLE_DEVICES=$XPU_VISIBLE_DEVICES"
+
+            # Download and install xre
+            echo "Downloading and installing xre..."
+            mkdir -p /workspace/deps
+            cd /workspace/deps
+            if [ ! -d "xre" ]; then
+              wget -q https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz
+              tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre
+            fi
+            cd -
+            export PATH=/workspace/deps/xre/bin:$PATH
+
+            # Reset the XPU cards
+            echo "Resetting XPU cards..."
+            xpu-smi -r -i $XPU_VISIBLE_DEVICES
+            xpu-smi
+            set -e
+            git config --global --add safe.directory /workspace/FastDeploy
+            cd FastDeploy
+            python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+            python -m pip install -r requirements.txt
+            echo "Installing PaddlePaddle..."
+            # Use a different PaddlePaddle package depending on the branch or tag
+            if [[ "${PADDLE_WHL_URL}" != "" ]];then
+              python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y
+              python -m pip install ${PADDLE_WHL_URL}
+            elif [[ "${PADDLEVERSION}" != "" ]];then
+              python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y
+              python -m pip install paddlepaddle-xpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+            else
+              python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y
+              # python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
+              python -m pip install https://paddle-whl.bj.bcebos.com/nightly/xpu-p800/paddlepaddle-xpu/paddlepaddle_xpu-3.4.0.dev20260107-cp310-cp310-linux_x86_64.whl
+            fi
+            echo "Installing the fastdeploy-xpu wheel built by the upstream job..."
ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "All ${REPO_NAME}* removed successfully" + break + fi + + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" + ls -ld "${REPO_NAME}"* + exit 1 + fi + ' + + wget -q --no-proxy ${fd_archive_url} + tar -xf FastDeploy.tar.gz + rm -rf FastDeploy.tar.gz + cd FastDeploy + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline + + - name: Run CI unittest + env: + docker_image: ${{ inputs.DOCKER_IMAGE }} + fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} + fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }} + model_path: ${{ inputs.MODEL_PATH }} + run: | + runner_name="${{ runner.name }}" + last_char="${runner_name: -1}" + + if [[ "$last_char" == "1" ]]; then + xpu_id="4" + else + xpu_id="0" + fi + PARENT_DIR=$(dirname "$WORKSPACE") + echo "PARENT_DIR:$PARENT_DIR" + docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \ + -v $(pwd):/workspace -w /workspace \ + -v "/ssd3:/ssd3" \ + -e "MODEL_PATH=${model_path}" \ + -e "FASTDEPLOY_ARCHIVE_URL=${fd_archive_url}" \ + -e "FASTDEPLOY_WHEEL_URL=${fd_wheel_url}" \ + -e "PADDLEVERSION=${PADDLEVERSION}" \ + -e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \ + -e "http_proxy=$(git config --global --get http.proxy)" \ + -e "https_proxy=$(git config --global --get https.proxy)" \ + -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,127.0.0.1,localhost" \ + -e "XPU_ID=${xpu_id}" \ + ${docker_image} /bin/bash -c ' + echo "安装lsof工具..." + apt install -y lsof + + # 设置XPU_VISIBLE_DEVICES + if [[ "$XPU_ID" == "0" ]]; then + export XPU_VISIBLE_DEVICES="0,1,2,3" + else + export XPU_VISIBLE_DEVICES="4,5,6,7" + fi + echo "XPU_VISIBLE_DEVICES=$XPU_VISIBLE_DEVICES" + + # 下载和安装xre + echo "下载和安装xre..." + mkdir -p /workspace/deps + cd /workspace/deps + if [ ! -d "xre" ]; then + wget -q https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz + tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre + fi + cd - + export PATH=/workspace/deps/xre/bin:$PATH + + # 重启XPU卡 + echo "重启XPU卡..." + xpu-smi -r -i $XPU_VISIBLE_DEVICES + xpu-smi + set -e + git config --global --add safe.directory /workspace/FastDeploy + cd FastDeploy + python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + python -m pip install -r requirements.txt + echo "安装PaddlePaddle..." + # 针对不同分支和tag使用不同的PaddlePaddle安装包 + if [[ "${PADDLE_WHL_URL}" != "" ]];then + python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y + python -m pip install ${PADDLE_WHL_URL} + elif [[ "${PADDLEVERSION}" != "" ]];then + python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y + python -m pip install paddlepaddle-xpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ + else + python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y + # python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/ + python -m pip install https://paddle-whl.bj.bcebos.com/nightly/xpu-p800/paddlepaddle-xpu/paddlepaddle_xpu-3.4.0.dev20260107-cp310-cp310-linux_x86_64.whl + fi + echo "安装上游任务编译的fastdeploy-xpu..." 
+ python -m pip install ${FASTDEPLOY_WHEEL_URL} + rm -rf fastdeploy + python -m pip install ${FASTDEPLOY_WHEEL_URL} --no-deps --target=/workspace/FastDeploy + echo "============================安装测试依赖============================" + python -m pip install openai -U + python -m pip uninstall -y triton + python -m pip install triton==3.3.0 + python -m pip install pytest + python -m pip install pytest-timeout + unset http_proxy + unset https_proxy + echo "============================开始运行pytest测试============================" + export PYTHONPATH=/workspace/FastDeploy/ + python -m pytest -v -s --tb=short tests/xpu_ci/4cards_cases/ + exit_code=$? + + if [ $exit_code -eq 0 ]; then + echo "============================4卡cases测试通过!============================" + else + echo "============================4卡cases测试失败,请检查日志!============================" + exit $exit_code + fi + ' \ No newline at end of file diff --git a/.github/workflows/_xpu_8cards_case_test.yml b/.github/workflows/_xpu_8cards_case_test.yml new file mode 100644 index 00000000000..f333519ccee --- /dev/null +++ b/.github/workflows/_xpu_8cards_case_test.yml @@ -0,0 +1,184 @@ +name: xpu_8cards_case_test + +on: + workflow_call: + inputs: + DOCKER_IMAGE: + description: "Build Images" + required: true + type: string + default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci" + FASTDEPLOY_ARCHIVE_URL: + description: "URL of the compressed FastDeploy code archive." + required: true + type: string + FASTDEPLOY_WHEEL_URL: + description: "URL of the compressed FastDeploy whl ." + required: true + type: string + FD_VERSION: + description: "FastDeploy Package Version" + required: false + type: string + default: "" + PADDLEVERSION: + description: "Paddle Version Build Use" + required: false + type: string + default: "" + PADDLE_WHL_URL: + description: "Paddle Wheel Package URL" + required: false + type: string + default: "" + MODEL_PATH: + description: "MODEL Dir Use" + required: true + type: string + default: "" + +jobs: + run_xpu_8cards_cases: + runs-on: [self-hosted, XPU-P800-8Cards] + timeout-minutes: 60 + steps: + - name: Print current runner name + run: | + echo "Current runner name: ${{ runner.name }}" + - name: Code Prepare + shell: bash + env: + docker_image: ${{ inputs.DOCKER_IMAGE }} + fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} + fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }} + model_path: ${{ inputs.MODEL_PATH }} + run: | + set -x + REPO="https://github.com/${{ github.repository }}.git" + FULL_REPO="${{ github.repository }}" + REPO_NAME="${FULL_REPO##*/}" + BASE_BRANCH="${{ github.base_ref }}" + docker pull ${docker_image} + # Clean the repository directory before starting + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ + -e "REPO_NAME=${REPO_NAME}" \ + ${docker_image} /bin/bash -c ' + CLEAN_RETRIES=3 + CLEAN_COUNT=0 + + while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do + echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..." + rm -rf "${REPO_NAME}"* || true + sleep 2 + + # Check if anything matching ${REPO_NAME}* still exists + if ! 
ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "All ${REPO_NAME}* removed successfully" + break + fi + + CLEAN_COUNT=$((CLEAN_COUNT + 1)) + done + + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" + ls -ld "${REPO_NAME}"* + exit 1 + fi + ' + + wget -q --no-proxy ${fd_archive_url} + tar -xf FastDeploy.tar.gz + rm -rf FastDeploy.tar.gz + cd FastDeploy + git config --global user.name "FastDeployCI" + git config --global user.email "fastdeploy_ci@example.com" + git log -n 3 --oneline + + - name: Run CI unittest + env: + docker_image: ${{ inputs.DOCKER_IMAGE }} + fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }} + fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }} + model_path: ${{ inputs.MODEL_PATH }} + run: | + runner_name="${{ runner.name }}" + last_char="${runner_name: -1}" + + PARENT_DIR=$(dirname "$WORKSPACE") + echo "PARENT_DIR:$PARENT_DIR" + docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \ + -v $(pwd):/workspace -w /workspace \ + -v "/ssd3:/ssd3" \ + -e "MODEL_PATH=${model_path}" \ + -e "FASTDEPLOY_ARCHIVE_URL=${fd_archive_url}" \ + -e "FASTDEPLOY_WHEEL_URL=${fd_wheel_url}" \ + -e "PADDLEVERSION=${PADDLEVERSION}" \ + -e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \ + -e "http_proxy=$(git config --global --get http.proxy)" \ + -e "https_proxy=$(git config --global --get https.proxy)" \ + -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,127.0.0.1,localhost" \ + ${docker_image} /bin/bash -c ' + echo "安装lsof工具..." + apt install -y lsof + + # 设置XPU_VISIBLE_DEVICES + export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" + echo "XPU_VISIBLE_DEVICES=$XPU_VISIBLE_DEVICES" + + # 下载和安装xre + echo "下载和安装xre..." + mkdir -p /workspace/deps + cd /workspace/deps + if [ ! -d "xre" ]; then + wget -q https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz + tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre + fi + cd - + export PATH=/workspace/deps/xre/bin:$PATH + + # 重启XPU卡 + echo "重启XPU卡..." + xpu-smi -r -i $XPU_VISIBLE_DEVICES + xpu-smi + set -e + git config --global --add safe.directory /workspace/FastDeploy + cd FastDeploy + python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + python -m pip install -r requirements.txt + echo "安装PaddlePaddle..." + # 针对不同分支和tag使用不同的PaddlePaddle安装包 + if [[ "${PADDLE_WHL_URL}" != "" ]];then + python -m pip install ${PADDLE_WHL_URL} + elif [[ "${PADDLEVERSION}" != "" ]];then + python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y + python -m pip install paddlepaddle-xpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ + else + python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y + # python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/ + python -m pip install https://paddle-whl.bj.bcebos.com/nightly/xpu-p800/paddlepaddle-xpu/paddlepaddle_xpu-3.4.0.dev20260107-cp310-cp310-linux_x86_64.whl + fi + echo "安装上游任务编译的fastdeploy-xpu..." 
+            python -m pip install ${FASTDEPLOY_WHEEL_URL}
+            rm -rf fastdeploy
+            python -m pip install ${FASTDEPLOY_WHEEL_URL} --no-deps --target=/workspace/FastDeploy
+            echo "============================Installing test dependencies============================"
+            python -m pip install openai -U
+            python -m pip uninstall -y triton
+            python -m pip install triton==3.3.0
+            python -m pip install pytest
+            python -m pip install pytest-timeout
+            unset http_proxy
+            unset https_proxy
+            echo "============================Running pytest cases============================"
+            export PYTHONPATH=/workspace/FastDeploy/
+            python -m pytest -v -s --tb=short tests/xpu_ci/8cards_cases/
+            exit_code=$?
+
+            if [ $exit_code -eq 0 ]; then
+              echo "============================8-card cases passed!============================"
+            else
+              echo "============================8-card cases failed, please check the logs!============================"
+              exit $exit_code
+            fi
+            '
\ No newline at end of file
diff --git a/.github/workflows/ci_xpu.yml b/.github/workflows/ci_xpu.yml
index 7cb88cc16e4..14e8511095d 100644
--- a/.github/workflows/ci_xpu.yml
+++ b/.github/workflows/ci_xpu.yml
@@ -2,82 +2,43 @@ name: CI_XPU
 
 on:
   pull_request:
-    branches:
-      - develop
-      - 'release/*'
-  workflow_dispatch:
+    types: [opened, synchronize]
+    branches: [develop, release/**]
 
+permissions: read-all
 
 concurrency:
-  group: ${{ github.event.pull_request.number }}-xpu-ci
+  group: ${{ github.event.pull_request.number }}-${{ github.workflow }}
   cancel-in-progress: true
 
 jobs:
-  CI_XPU:
-    timeout-minutes: 60
-    runs-on: [self-hosted, XPU-P800-8Card]
-    steps:
-      - name: Print current runner name
-        run: |
-          echo "Current runner name: ${{ runner.name }}"
-      # Because the system version is lower than 2.23, the checkout cannot be used.
-      # - name: Checkout code
-      #   uses: actions/checkout@v4
+  clone:
+    name: FD-Clone-Linux-XPU
+    uses: ./.github/workflows/_clone_linux.yml
 
-      - name: Code Checkout
-        env:
-          docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci
-        run: |
-          REPO="https://github.com/${{ github.repository }}.git"
-          FULL_REPO="${{ github.repository }}"
-          REPO_NAME="${FULL_REPO##*/}"
-          BASE_BRANCH="${{ github.base_ref }}"
-          # Clean the repository directory before starting
-          docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-            -e "REPO_NAME=${REPO_NAME}" \
-            -e "BASE_BRANCH=${BASE_BRANCH}" \
-            ${docker_image} /bin/bash -c '
-            if [ -d ${REPO_NAME} ]; then
-              echo "Directory ${REPO_NAME} exists, removing it..."
-              rm -rf ${REPO_NAME}
-            fi
-            '
-          git config --global user.name "FastDeployCI"
-          git config --global user.email "fastdeploy_ci@example.com"
-          git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
-          cd FastDeploy
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
-            git merge pr/${{ github.event.pull_request.number }}
-            git log -n 3 --oneline
-          else
-            git checkout ${{ github.sha }}
-            git log -n 3 --oneline
-          fi
+  xpu_build_test:
+    name: xpu_build_test
+    needs: [clone]
+    uses: ./.github/workflows/_build_xpu.yml
+    with:
+      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci
 
-      - name: Run CI unittest
-        env:
-          docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci
-        run: |
-          runner_name="${{ runner.name }}"
-          last_char="${runner_name: -1}"
+  xpu_4cards_case_test:
+    name: xpu_4cards_case_test
+    needs: [clone, xpu_build_test]
+    uses: ./.github/workflows/_xpu_4cards_case_test.yml
+    with:
+      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci
+      FASTDEPLOY_WHEEL_URL: ${{ needs.xpu_build_test.outputs.wheel_path }}
+      MODEL_PATH: /ssd3/model
 
-          if [[ "$last_char" == "1" ]]; then
-            xpu_id="4"
-          else
-            xpu_id="0"
-          fi
-          PARENT_DIR=$(dirname "$WORKSPACE")
-          echo "PARENT_DIR:$PARENT_DIR"
-          docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \
-            -v $(pwd):/workspace -w /workspace \
-            -v "/ssd3:/ssd3" \
-            -e "MODEL_PATH=/ssd3/model" \
-            -e "http_proxy=$(git config --global --get http.proxy)" \
-            -e "https_proxy=$(git config --global --get https.proxy)" \
-            -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,127.0.0.1,localhost" \
-            -e "XPU_ID=${xpu_id}" \
-            ${docker_image} /bin/bash -c "
-            git config --global --add safe.directory /workspace/FastDeploy
-            cd FastDeploy
-            bash scripts/run_xpu_ci_pytest.sh
-            "
+  xpu_8cards_case_test:
+    name: xpu_8cards_case_test
+    needs: [clone, xpu_build_test]
+    uses: ./.github/workflows/_xpu_8cards_case_test.yml
+    with:
+      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci
+      FASTDEPLOY_WHEEL_URL: ${{ needs.xpu_build_test.outputs.wheel_path }}
+      MODEL_PATH: /ssd3/model
diff --git a/tests/xpu_ci/conftest.py b/tests/xpu_ci/4cards_cases/conftest.py
similarity index 99%
rename from tests/xpu_ci/conftest.py
rename to tests/xpu_ci/4cards_cases/conftest.py
index 9df30e8f00c..b6918d33123 100644
--- a/tests/xpu_ci/conftest.py
+++ b/tests/xpu_ci/4cards_cases/conftest.py
@@ -364,7 +364,7 @@ def get_script_dir():
     """Return the path of the scripts directory."""
     # conftest.py lives under tests/xpu_ci/*/, scripts is under the project root
     current_dir = os.path.dirname(os.path.abspath(__file__))
-    project_root = os.path.dirname(os.path.dirname(current_dir))
+    project_root = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
     return os.path.join(project_root, "scripts")
 
diff --git a/tests/xpu_ci/test_ep4tp1_online.py b/tests/xpu_ci/4cards_cases/test_ep4tp1_online.py
similarity index 100%
rename from tests/xpu_ci/test_ep4tp1_online.py
rename to tests/xpu_ci/4cards_cases/test_ep4tp1_online.py
diff --git a/tests/xpu_ci/test_ep4tp4_all2all.py b/tests/xpu_ci/4cards_cases/test_ep4tp4_all2all.py
similarity index 100%
rename from tests/xpu_ci/test_ep4tp4_all2all.py
rename to tests/xpu_ci/4cards_cases/test_ep4tp4_all2all.py
diff --git a/tests/xpu_ci/test_ep4tp4_online.py b/tests/xpu_ci/4cards_cases/test_ep4tp4_online.py
similarity index 100%
rename from tests/xpu_ci/test_ep4tp4_online.py
rename to tests/xpu_ci/4cards_cases/test_ep4tp4_online.py
diff --git a/tests/xpu_ci/test_logprobs_21b_tp4.py b/tests/xpu_ci/4cards_cases/test_logprobs_21b_tp4.py
similarity index 100%
rename from tests/xpu_ci/test_logprobs_21b_tp4.py
rename to tests/xpu_ci/4cards_cases/test_logprobs_21b_tp4.py
diff --git a/tests/xpu_ci/test_mtp.py b/tests/xpu_ci/4cards_cases/test_mtp.py
similarity index 100%
rename from tests/xpu_ci/test_mtp.py
rename to tests/xpu_ci/4cards_cases/test_mtp.py
diff --git a/tests/xpu_ci/test_pd_03b_tp1.py b/tests/xpu_ci/4cards_cases/test_pd_03b_tp1.py
similarity index 100%
rename from tests/xpu_ci/test_pd_03b_tp1.py
rename to tests/xpu_ci/4cards_cases/test_pd_03b_tp1.py
diff --git a/tests/xpu_ci/test_pd_21b_tp2.py b/tests/xpu_ci/4cards_cases/test_pd_21b_tp2.py
similarity index 100%
rename from tests/xpu_ci/test_pd_21b_tp2.py
rename to tests/xpu_ci/4cards_cases/test_pd_21b_tp2.py
diff --git a/tests/xpu_ci/test_v1_mode.py b/tests/xpu_ci/4cards_cases/test_v1_mode.py
similarity index 100%
rename from tests/xpu_ci/test_v1_mode.py
rename to tests/xpu_ci/4cards_cases/test_v1_mode.py
diff --git a/tests/xpu_ci/test_vl_model.py b/tests/xpu_ci/4cards_cases/test_vl_model.py
similarity index 100%
rename from tests/xpu_ci/test_vl_model.py
rename to tests/xpu_ci/4cards_cases/test_vl_model.py
diff --git a/tests/xpu_ci/test_w4a8.py b/tests/xpu_ci/4cards_cases/test_w4a8.py
similarity index 100%
rename from tests/xpu_ci/test_w4a8.py
rename to tests/xpu_ci/4cards_cases/test_w4a8.py
diff --git a/tests/xpu_ci/8cards_cases/conftest.py b/tests/xpu_ci/8cards_cases/conftest.py
new file mode 100644
index 00000000000..b6918d33123
--- /dev/null
+++ b/tests/xpu_ci/8cards_cases/conftest.py
@@ -0,0 +1,487 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+XPU CI test framework - shared configuration and helper functions.
+
+This file contains the functions and fixtures shared by all test cases.
+Main responsibilities:
+1. Process management - start and stop the API server
+2. Health check - wait for the service to come up
+3. Resource cleanup - clean logs, core files, message queues, etc.
+4. Environment setup - set XPU-related environment variables
+"""
+
+import json
+import os
+import shutil
+import subprocess
+import time
+
+import pytest
+
+
+def get_xpu_id():
+    """Return the XPU_ID environment variable."""
+    return int(os.getenv("XPU_ID", "0"))
+
+
+def get_port_num():
+    """Compute the port number from XPU_ID."""
+    xpu_id = get_xpu_id()
+    return 8188 + xpu_id * 100
+
+
+def stop_processes():
+    """
+    Stop all related processes (minimal-change version that avoids killing pytest).
+    """
+    xpu_id = get_xpu_id()  # noqa: F841
+    port_num = get_port_num()
+
+    # Get the PID(s) of the pytest main process
+    try:
+        pytest_pids = subprocess.check_output("pgrep -f pytest || true", shell=True).decode().strip().split()
+    except subprocess.CalledProcessError:
+        pytest_pids = []
+
+    def safe_kill_cmd(cmd):
+        """Run a kill command, excluding pytest processes."""
+        try:
+            # First list the candidate PIDs (replace kill -9 with cat)
+            list_cmd = cmd.replace("kill -9", "cat")
+            output = subprocess.check_output(list_cmd, shell=True, stderr=subprocess.DEVNULL).decode().strip().split()
+
+            # Filter: exclude pytest
+            safe_pids = [pid for pid in output if pid and pid not in pytest_pids]
+
+            # Actually kill
+            for pid in safe_pids:
+                subprocess.run(f"kill -9 {pid}", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        except Exception:
+            pass
+
+    commands = [
+        "ps -efww | grep -E 'cache_transfer_manager.py' | grep -v grep | awk '{print $2}' | xargs echo",
+        "ps -efww | grep -E 'api_server' | grep -v grep | awk '{print $2}' | xargs echo",
+        "ps -efww | grep -E 'multiprocessing' | grep -v grep | awk '{print $2}' | xargs echo",
+        "ps -efww | grep -E 'fastdeploy' | grep -v grep | awk '{print $2}' | xargs echo",
+        f"ps -efww | grep -E '{port_num}' | grep -v grep | awk '{{print $2}}' | xargs echo",
+        f"lsof -t -i :{port_num} | xargs echo",
+    ]
+
+    # Kill additional ports
+    for port in range(port_num + 10, port_num + 41):
+        commands.append(f"lsof -t -i :{port} | xargs echo")
+
+    # Kill processes using netstat
+    commands.extend(
+        [
+            f"netstat -tunlp 2>/dev/null | grep {port_num + 2} | awk '{{print $NF}}' | awk -F'/' '{{print $1}}' | xargs echo",
+            f"netstat -tunlp 2>/dev/null | grep {port_num + 2} | awk '{{print $(NF-1)}}' | cut -d/ -f1 | grep -E '^[0-9]+$' | xargs echo",
+        ]
+    )
+
+    for cmd in commands:
+        safe_kill_cmd(cmd)
+
+
+def cleanup_resources():
+    """
+    Clean up resources.
+
+    Includes:
+    1. removing the log directory
+    2. removing core files
+    3. clearing message queues
+    """
+    # Remove the log directory
+    if os.path.exists("log"):
+        shutil.rmtree("log")
+
+    # Remove core files
+    subprocess.run("rm -f core*", shell=True)
+
+    # Clear message queues
+    subprocess.run(
+        "ipcrm --all=msg 2>/dev/null || true", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+    )
+
+
+def wait_for_health_check(timeout=900, interval=10):
+    """
+    Wait until the service health check passes.
+
+    Args:
+        timeout: timeout in seconds, default 15 minutes
+        interval: check interval in seconds, default 10 seconds
+
+    Returns:
+        bool: whether the service started successfully
+    """
+    port_num = get_port_num()
+    health_endpoint = f"http://0.0.0.0:{port_num}/health"
+    models_endpoint = f"http://0.0.0.0:{port_num}/v1/models"
+    start_time = time.time()
+
+    print(f"Starting service health check, waiting at most {timeout} seconds")
+
+    # Phase 1: wait until /health returns 200
+    while True:
+        elapsed = int(time.time() - start_time)
+
+        # Timeout check
+        if elapsed >= timeout:
+            print(f"\nService startup timed out: still not up after {timeout//60} minutes!")
+            return False
+
+        # Send the health check request
+        try:
+            result = subprocess.run(
+                f'curl -s -o /dev/null -w "%{{http_code}}" -m 2 {health_endpoint}',
+                shell=True,
+                capture_output=True,
+                text=True,
+            )
+            http_code = result.stdout.strip()
+        except Exception:
+            http_code = "000"
+
+        print(f"\rHealth check in progress... waited {elapsed}s, current status code: {http_code}", end="", flush=True)
+
+        if http_code == "200":
+            print(f"\nHealth check passed after {elapsed} seconds")
+            break
+
+        time.sleep(interval)
+
+    # Phase 2: wait until /v1/models returns a valid model list, so the model is fully ready
+    print("Verifying the model is ready...")
+    while True:
+        elapsed = int(time.time() - start_time)
+
+        # Timeout check
+        if elapsed >= timeout:
+            print(f"\nModel readiness timed out: still not ready after {timeout//60} minutes!")
+            return False
+
+        # Check the model list
+        try:
+            result = subprocess.run(f"curl -s -m 5 {models_endpoint}", shell=True, capture_output=True, text=True)
+            response = result.stdout.strip()
+            if response:
+                data = json.loads(response)
+                # Check whether any model data is present
+                if data.get("data") and len(data["data"]) > 0:
+                    model_id = data["data"][0].get("id", "unknown")
+                    print(f"\nModel ready! model id: {model_id}, total time {elapsed} seconds")
+                    return True
+        except (json.JSONDecodeError, Exception) as e:  # noqa: F841
+            pass
+
+        print(f"\rWaiting for the model to become ready... waited {elapsed}s", end="", flush=True)
+        time.sleep(interval)
+
+
+def print_logs_on_failure():
+    """Print logs on failure."""
+    print("\n========== server.log ==========")
+    if os.path.exists("server.log"):
+        with open("server.log", "r") as f:
+            print(f.read())
+
+    print("\n========== log/workerlog.0 ==========")
+    if os.path.exists("log/workerlog.0"):
+        with open("log/workerlog.0", "r") as f:
+            print(f.read())
+
+
+def start_server(server_args, wait_before_check=60):
+    """
+    Start the API server.
+
+    Args:
+        server_args: list of server launch arguments
+        wait_before_check: seconds to wait after launch before the health check, default 60
+
+    Returns:
+        bool: whether the service started successfully
+    """
+    # Stop old processes
+    stop_processes()
+
+    # Clean up resources
+    cleanup_resources()
+
+    # Build the launch command
+    cmd = ["python", "-m", "fastdeploy.entrypoints.openai.api_server"] + server_args
+
+    # Start the service in the background
+    with open("server.log", "w") as log_file:
+        subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, start_new_session=True)
+
+    print(f"Server launch command: {' '.join(cmd)}")
+    print(f"Waiting {wait_before_check} seconds...")
+    time.sleep(wait_before_check)
+
+    # Health check
+    if not wait_for_health_check():
+        print_logs_on_failure()
+        stop_processes()
+        return False
+
+    return True
+
+
+@pytest.fixture(scope="function")
+def xpu_env():
+    """
+    Set the XPU environment variables.
+
+    This fixture sets XPU_VISIBLE_DEVICES at the start of every test
+    and cleans up automatically when the test ends.
+    """
+    xpu_id = get_xpu_id()
+
+    # Set XPU_VISIBLE_DEVICES
+    if xpu_id == 0:
+        os.environ["XPU_VISIBLE_DEVICES"] = "0,1,2,3"
+    else:
+        os.environ["XPU_VISIBLE_DEVICES"] = "4,5,6,7"
+
+    print(f"\nSet environment variable: XPU_VISIBLE_DEVICES={os.environ['XPU_VISIBLE_DEVICES']}")
+
+    yield
+
+    # Stop processes after the test finishes
+    print("\nTest finished, stopping the service...")
+    stop_processes()
+
+
+def get_model_path():
+    """Return the MODEL_PATH environment variable."""
+    model_path = os.getenv("MODEL_PATH")
+    if not model_path:
+        raise ValueError("MODEL_PATH environment variable is not set")
+    return model_path
+
+
+def setup_ep_env():
+    """
+    Set EP (Expert Parallel) related environment variables.
+
+    Returns:
+        dict: the original values, used to restore them later
+    """
+    env_vars = {
+        "BKCL_ENABLE_XDR": "1",
+        "BKCL_RDMA_NICS": "eth1,eth1,eth2,eth2",
+        "BKCL_TRACE_TOPO": "1",
+        "BKCL_PCIE_RING": "1",
+        "XSHMEM_MODE": "1",
+        "XSHMEM_QP_NUM_PER_RANK": "32",
+        "BKCL_RDMA_VERBS": "1",
+        "MOE_FFN_USE_DENSE_INPUT": "1",
+    }
+
+    # Save the original values
+    original_values = {}
+    for key in env_vars:
+        original_values[key] = os.environ.get(key)
+
+    # Set the new values
+    for key, value in env_vars.items():
+        os.environ[key] = value
+        print(f"Set environment variable: {key}={value}")
+
+    # Set BKCL_RDMA_NICS
+    rdma_nics = get_rdma_nics()
+    if rdma_nics:
+        os.environ["BKCL_RDMA_NICS"] = rdma_nics
+        print(f"Set environment variable: BKCL_RDMA_NICS={rdma_nics}")
+    return original_values
+
+
+def restore_env(original_values):
+    """
+    Restore environment variables.
+
+    Args:
+        original_values: the original values returned by setup_ep_env()
+    """
+    for key, value in original_values.items():
+        if value is None:
+            if key in os.environ:
+                del os.environ[key]
+                print(f"Removed environment variable: {key}")
+        else:
+            os.environ[key] = value
+            print(f"Restored environment variable: {key}={value}")
+
+
+def download_and_build_xdeepep():
+    """Download and build xDeepEP (used by the EP parallel tests)."""
+    if os.path.exists("xDeepEP"):
+        print("xDeepEP already exists, skipping download")
+        return True
+
+    print("Downloading xDeepEP...")
+    result = subprocess.run("wget -q https://paddle-qa.bj.bcebos.com/xpu_third_party/xDeepEP.tar.gz", shell=True)
+    if result.returncode != 0:
+        print("Failed to download xDeepEP")
+        return False
+
+    print("Extracting xDeepEP...")
+    result = subprocess.run("tar -xzf xDeepEP.tar.gz", shell=True)
+    if result.returncode != 0:
+        print("Failed to extract xDeepEP")
+        return False
+
+    print("Building xDeepEP...")
+    result = subprocess.run("cd xDeepEP && bash build.sh && cd -", shell=True)
+    if result.returncode != 0:
+        print("Failed to build xDeepEP")
+        return False
+
+    return True
+
+
+# ============ PD-disaggregation helpers ============
+
+
+def get_script_dir():
+    """Return the path of the scripts directory."""
+    # conftest.py lives under tests/xpu_ci/*/, scripts is under the project root
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
+    return os.path.join(project_root, "scripts")
+
+
+def get_rdma_nics():
+    """
+    Get the RDMA NIC configuration.
+
+    Returns:
+        str: the value of KVCACHE_RDMA_NICS, or an empty string on failure
+    """
+    script_path = os.path.join(get_script_dir(), "get_rdma_nics.sh")
+
+    try:
+        result = subprocess.run(f"bash {script_path} xpu", shell=True, capture_output=True, text=True)
+        output = result.stdout.strip()
+        # Parse the KVCACHE_RDMA_NICS=xxx format
+        if output.startswith("KVCACHE_RDMA_NICS="):
+            return output.split("=", 1)[1]
+        return output
+    except Exception as e:
+        print(f"Failed to get the RDMA NICs: {e}")
+        return ""
+
+
+def setup_pd_env():
+    """
+    Set PD-disaggregation related environment variables.
+
+    Returns:
+        dict: the original values, used to restore them later
+    """
+    original_values = {}
+    env_keys = ["KVCACHE_GDRCOPY_FLUSH_ENABLE", "KVCACHE_RDMA_NICS", "CUDA_ENABLE_P2P_NO_UVA"]
+
+    # Save the original values
+    for key in env_keys:
+        original_values[key] = os.environ.get(key)
+
+    # Set the new values
+    os.environ["KVCACHE_GDRCOPY_FLUSH_ENABLE"] = "1"
+    os.environ["CUDA_ENABLE_P2P_NO_UVA"] = "1"  # enable peer mem
+    print("Set environment variable: KVCACHE_GDRCOPY_FLUSH_ENABLE=1")
+    print("Set environment variable: CUDA_ENABLE_P2P_NO_UVA=1")
+
+    # Get and set the RDMA NICs
+    rdma_nics = get_rdma_nics()
+    if rdma_nics:
+        os.environ["KVCACHE_RDMA_NICS"] = rdma_nics
+        print(f"Set environment variable: KVCACHE_RDMA_NICS={rdma_nics}")
+
+    return original_values
+
+
+def restore_pd_env(original_values):
+    """
+    Restore PD-disaggregation related environment variables.
+
+    Args:
+        original_values: the original values returned by setup_pd_env()
+    """
+    env_keys = ["KVCACHE_GDRCOPY_FLUSH_ENABLE", "KVCACHE_RDMA_NICS", "CUDA_ENABLE_P2P_NO_UVA"]
+
+    for key in env_keys:
+        if key in original_values:
+            if original_values[key] is None:
+                if key in os.environ:
+                    del os.environ[key]
+                    print(f"Removed environment variable: {key}")
+            else:
+                os.environ[key] = original_values[key]
+                print(f"Restored environment variable: {key}={original_values[key]}")
+
+
+def setup_pd_ep_env():
+    """
+    Set PD-disaggregation + EP related environment variables.
+
+    Returns:
+        dict: the original values, used to restore them later
+    """
+    original_values_pd = setup_pd_env()
+    original_values_ep = setup_ep_env()
+    original_values = {**original_values_pd, **original_values_ep}
+    return original_values
+
+
+def restore_pd_ep_env(original_values):
+    """
+    Restore PD-disaggregation + EP related environment variables.
+
+    Args:
+        original_values: the original values returned by setup_pd_ep_env()
+    """
+    restore_env(original_values)
+    restore_pd_env(original_values)
+
+
+def setup_logprobs_env():
+    """
+    Set logprobs-related environment variables.
+
+    Returns:
+        dict: the original values, used to restore them later
+    """
+    env_vars = {
+        "FD_USE_GET_SAVE_OUTPUT_V1": "1",
+    }
+    os.system("sysctl -w kernel.msgmax=131072")
+    os.system("sysctl -w kernel.msgmnb=33554432")
+
+    # Save the original values
+    original_values = {}
+    for key in env_vars:
+        original_values[key] = os.environ.get(key)
+
+    # Set the new values
+    for key, value in env_vars.items():
+        os.environ[key] = value
+        print(f"Set environment variable: {key}={value}")
+    return original_values
diff --git a/tests/xpu_ci/test_pd_21b_tp1ep4.py b/tests/xpu_ci/8cards_cases/test_pd_21b_tp1ep4.py
similarity index 98%
rename from tests/xpu_ci/test_pd_21b_tp1ep4.py
rename to tests/xpu_ci/8cards_cases/test_pd_21b_tp1ep4.py
index b4aad965cf8..250e6a8a4b8 100644
--- a/tests/xpu_ci/test_pd_21b_tp1ep4.py
+++ b/tests/xpu_ci/8cards_cases/test_pd_21b_tp1ep4.py
@@ -32,6 +32,7 @@
 import pytest
 from conftest import (
     cleanup_resources,
+    download_and_build_xdeepep,
     get_model_path,
     get_port_num,
     restore_pd_ep_env,
@@ -134,6 +135,9 @@ def start_pd_server(model_path, port_num, wait_before_check=60):
     # Clean up resources
     cleanup_resources()
 
+    if not download_and_build_xdeepep():
+        pytest.fail("Failed to download or build xDeepEP")
+
     # Clean and create the log directories
     for log_dir in ["log_router", "log_prefill", "log_decode"]:
diff --git a/tests/xpu_ci/test_pd_21b_tp4ep4.py b/tests/xpu_ci/8cards_cases/test_pd_21b_tp4ep4.py
similarity index 98%
rename from tests/xpu_ci/test_pd_21b_tp4ep4.py
rename to tests/xpu_ci/8cards_cases/test_pd_21b_tp4ep4.py
index 2b69a0ed962..6b2ec6ee126 100644
--- a/tests/xpu_ci/test_pd_21b_tp4ep4.py
+++ b/tests/xpu_ci/8cards_cases/test_pd_21b_tp4ep4.py
@@ -32,6 +32,7 @@
 import pytest
 from conftest import (
     cleanup_resources,
+    download_and_build_xdeepep,
     get_model_path,
     get_port_num,
     restore_pd_ep_env,
@@ -134,6 +135,9 @@ def start_pd_server(model_path, port_num, wait_before_check=60):
     # Clean up resources
     cleanup_resources()
+
+    if not download_and_build_xdeepep():
+        pytest.fail("Failed to download or build xDeepEP")
 
     # Clean and create the log directories
     for log_dir in ["log_router", "log_prefill", "log_decode"]: