Skip to content

[CI] Add workflow for LLM & unittest-gpu #10878

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions .github/workflows/distribute-a100.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# CI workflow that runs PaddleNLP distributed-training tests on A100 GPUs.
name: Distribute CI (A100)

on:
  pull_request:
    types: [opened, synchronize, reopened]
    branches: [develop]
  schedule:
    # Nightly run at 00:01 UTC.
    - cron: "1 0 * * *"
  workflow_call:
    inputs:
      # "false" skips every downstream step; any other value runs them.
      run_downstream:
        required: true
        type: string
      # Docker image override; empty selects the default image in the job.
      image_name:
        required: true
        type: string

concurrency:
  # PR runs are grouped per PR number. Scheduled / workflow_call runs have no
  # PR number; fall back to run_id so they do not all collapse into one group
  # and cancel each other via cancel-in-progress.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

env:
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  # Base name for the per-run Docker container (timestamp appended later).
  TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-Distribut-A100
  ci_scripts: /workspace/PaddleNLP/scripts/distribute
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: distribute-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  GITHUB_EVENT_NAME: ${{ github.event_name }}
  RUN_DOWNSTREAM: ${{ inputs.run_downstream }}

defaults:
  run:
    shell: bash

jobs:
  distribute-a100-ci:
    name: distribute-a100-ci
    # Self-hosted runner group with A100 GPUs attached.
    runs-on:
      group: Distribute
    steps:
      # Resolve the Docker image: an explicit workflow_call input wins;
      # otherwise fall back to the default Paddle CUDA 11.8 dev image.
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82" >> "$GITHUB_ENV"
          fi

      # Start a detached container that every later step `docker exec`s into.
      # When RUN_DOWNSTREAM == "false" no container is started; the later
      # steps check the same flag and skip themselves.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          CACHE_DIR: /home/data/cfs/.cache
          FLAGS_dynamic_static_unified_comm: "True"
          FLAGS_dataloader_use_file_descriptor: "False"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          # container_name is always written to GITHUB_ENV (even when the run
          # is skipped) so the cleanup step can reference it unconditionally.
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            # Bind-mount the host CUDA libraries, NVIDIA device nodes and
            # nvidia-smi into the container.
            export CUDA_SO="$(\ls -d /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls -d /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
            export DEVICES="$(\ls -d /dev/nvidia* | xargs -I{} echo "-v {}:{}") $(\ls /dev/nvidia-caps/* | xargs -I{} echo "-v {}:{}")"
            export SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi"
            docker run -d -t --name ${container_name} ${CUDA_SO} ${DEVICES} ${SMI} --runtime=nvidia --shm-size=32G \
              --network host -v /dev/shm:/dev/shm \
              -v $work_dir/../../..:$work_dir/../../.. \
              -v $work_dir:/workspace \
              -v /home/.cache/pip:/home/.cache/pip \
              -v /home/FleetX_CI:/fleetx_data \
              -v /home/Llm_gpt_CI:/llm_gpt_data \
              -v /home/Llama_CI:/llama_data \
              -e BRANCH \
              -e AGILE_COMPILE_BRANCH \
              -e PR_ID \
              -e COMMIT_ID \
              -e work_dir \
              -e ci_scripts \
              -e no_proxy \
              -e CI_name \
              -e paddle_whl \
              -e FLAGS_dynamic_static_unified_comm \
              -e FLAGS_dataloader_use_file_descriptor \
              -e python_version \
              -w /workspace $IMAGE_NAME
          fi

- name: Download Code
run: |
if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
echo "Not in a pull_request or test_build event. Skipping.."
else
docker exec -t $container_name /bin/bash -c '
rm -rf * .[^.]*
echo "Downloading PaddleNLP.tar.gz"
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate
echo "Extracting PaddleNLP.tar.gz"
tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar
source $work_dir/../../../proxy
cd PaddleNLP
git config --global user.name "PaddleCI"
git config --global user.email "[email protected]"
git pull
git submodule update --init --recursive --force
if [ -n "${PR_ID}" ]; then
git fetch origin pull/${PR_ID}/head
git checkout -b PR_${PR_ID} FETCH_HEAD
git remote add upstream https://github.com/PaddlePaddle/PaddleFormers.git
git fetch upstream ${BRANCH}
git merge ${BRANCH} --no-edit
git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
else
echo "Not in a pull_request event. Skipping PR-specific operations."
fi
git log --pretty=oneline -10
'
fi

      # Run the distributed-training test suite inside the container.
      - name: Test
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              ldconfig
              # Make `python` point at the requested interpreter version.
              ln -sf $(which python${python_version}) /usr/bin/python
              pip config set global.cache-dir "/home/.cache/pip"
              source $work_dir/../../../proxy
              set -e
              cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
              # Test entry point with a hard 80-minute budget.
              timeout 80m bash scripts/distribute/run_ci.sh ${paddle_whl}
            '
          fi

      # Push per-case logs to BOS, even when earlier steps failed.
      - name: Upload Logs
        if: always()
        env:
          home_path: ${{ github.workspace }}/..
          bos_file: ${{ github.workspace }}/../bos_retry/BosClient.py
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              unset http_proxy && unset https_proxy
              # Fetch the BOS upload client once; these host paths are visible
              # in the container because $work_dir/../../.. is bind-mounted.
              if [ ! -f "${{ env.bos_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/bos_retry.tar.gz https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate
                mkdir ${{ env.home_path }}/bos_retry
                tar xf ${{ env.home_path }}/bos_retry.tar.gz -C ${{ env.home_path }}/bos_retry
              fi

              # Destination prefix: plain PR run, test_build run, or nightly.
              if [[ "${{ env.RUN_DOWNSTREAM }}" == "" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}"
              elif [[ "${{ env.RUN_DOWNSTREAM }}" == "true" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}/test_build"
              else
                bos_prefix="schedule/$(date +%Y%m%d)"
              fi

              cd /workspace/case_logs
              for FILE in /workspace/case_logs/*; do
                file=$(basename "$FILE")
                python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleNLP/distribute-a100/${bos_prefix}/logs
                echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/distribute-a100/${bos_prefix}/logs/$file"
              done
            '
          fi

- name: Terminate And Delete the Container
if: always()
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
docker rm -f $container_name 2>/dev/null || true
206 changes: 206 additions & 0 deletions .github/workflows/llm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
# CI workflow that runs the PaddleNLP LLM regression suite on GPU runners.
name: LLM CI

on:
  pull_request:
    types: [opened, synchronize, reopened]
    branches: [develop]
  schedule:
    # Nightly run at 00:02 UTC.
    - cron: "2 0 * * *"
  workflow_call:
    inputs:
      # "false" skips every downstream step; any other value runs them.
      run_downstream:
        required: true
        type: string
      # Docker image override; empty selects the default image in the job.
      image_name:
        required: true
        type: string

concurrency:
  # PR runs are grouped per PR number. Scheduled / workflow_call runs have no
  # PR number; fall back to run_id so they do not all collapse into one group
  # and cancel each other via cancel-in-progress.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

env:
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  # Base name for the per-run Docker container (timestamp appended later).
  TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-llm
  ci_scripts: /workspace/PaddleNLP/scripts/regression
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: llm-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  HF_ENDPOINT: https://hf-mirror.com
  STUDIO_GIT_HOST: http://git.prod.idc-to-cloud.aistudio.baidu-int.com
  PPNLP_HOME: /ssd1/paddlenlp
  HF_DATASETS_CACHE: /ssd1/paddlenlp/huggingface/datasets
  TRANSFORMERS_CACHE: /ssd1/paddlenlp/huggingface
  CCACHE_DIR: /home/data/gzcfs/.ccache/gpubox
  RUN_DOWNSTREAM: ${{ inputs.run_downstream }}

defaults:
  run:
    shell: bash

jobs:
  llm-ci:
    name: llm-ci
    # Self-hosted 8-GPU runner.
    runs-on: [self-hosted, ernie-8gpu]
    steps:
      # Resolve the Docker image: an explicit workflow_call input wins;
      # otherwise fall back to the default paddlecloud image.
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu18.04-gcc8.2-cuda11.8-cudnn8.6-nccl2.15.5-paddlenlp-latest" >> "$GITHUB_ENV"
          fi

      # Start a detached container that every later step `docker exec`s into.
      # When RUN_DOWNSTREAM == "false" no container is started; the later
      # steps check the same flag and skip themselves.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          CACHE_DIR: /home/data/cfs/.cache
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          # container_name is always written to GITHUB_ENV (even when the run
          # is skipped) so the cleanup step can reference it unconditionally.
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
              -v $work_dir/../../..:$work_dir/../../.. \
              -v $work_dir:/workspace \
              -v /home/.cache/pip:/home/.cache/pip \
              -v /ssd1/paddlenlp:/ssd1/paddlenlp \
              -v /home/data/gzcfs/.ccache/gpubox:/home/data/gzcfs/.ccache/gpubox \
              -e BRANCH \
              -e AGILE_COMPILE_BRANCH \
              -e PR_ID \
              -e COMMIT_ID \
              -e work_dir \
              -e ci_scripts \
              -e no_proxy \
              -e CI_name \
              -e paddle_whl \
              -e HF_ENDPOINT \
              -e STUDIO_GIT_HOST \
              -e PPNLP_HOME \
              -e HF_DATASETS_CACHE \
              -e TRANSFORMERS_CACHE \
              -e CACHE_DIR \
              -e FLAGS_dynamic_static_unified_comm \
              -e python_version \
              -w /workspace --runtime=nvidia $IMAGE_NAME
          fi

- name: Download Code
env:
work_dir: ${{ github.workspace }}
run: |
if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
echo "Not in a pull_request or test_build event. Skipping.."
else
docker exec -t $container_name /bin/bash -c '
rm -rf * .[^.]*
echo "Downloading PaddleNLP.tar.gz"
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate
echo "Extracting PaddleNLP.tar.gz"
tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar
source $work_dir/../../../proxy
cd PaddleNLP
git config --global user.name "PaddleCI"
git config --global user.email "[email protected]"
git pull
git submodule update --init --recursive --force
if [ -n "${PR_ID}" ]; then
git fetch origin pull/${PR_ID}/head
git checkout -b PR_${PR_ID} FETCH_HEAD
git remote add upstream https://github.com/PaddlePaddle/PaddleNLP.git
git fetch upstream ${BRANCH}
git merge ${BRANCH} --no-edit
git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
else
echo "Not in a pull_request event. Skipping PR-specific operations."
fi
git log --pretty=oneline -10
'
fi

      # Temporarily revert commits that are known to break this suite.
      - name: Skip For Bug
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              cd /workspace/PaddleNLP
              git revert f2477c07272d04244cd3287d1f21c70482a4a85f --no-edit # bug introduced by PR #10413 - pending fix
              git revert 3e9d3518cbecd8357cec14f059776272713d5c62 --no-edit # bug introduced by PR #10912 - pending fix
              # rm -rf tests/llm/test_grpo.py tests/llm/test_reinforce_plus_plus.py
            '
          fi

      # Run the LLM regression suite inside the container.
      - name: Test
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              ldconfig
              # Repoint python3 at the requested interpreter version.
              unlink /usr/bin/python3
              ln -sf $(which python${python_version}) /usr/bin/python3
              pip config set global.cache-dir "/home/.cache/pip"
              set -e
              source $work_dir/../../../proxy
              cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
              # NOTE(review): this overrides the paddle_whl passed in via -e
              # with a wheel pinned to one commit, until the suite adapts to
              # Paddle #73283 — remove once the adaptation lands.
              export paddle_whl=https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/8ae7423e99b2ea96e410968a0ebb3f1795e37205/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
              # Regression entry point with a hard 2-hour budget.
              timeout 2h bash scripts/regression/run_ci.sh python${python_version} ${paddle_whl}
            '
          fi

      # Push model logs and an Allure report bundle to BOS, even on failure.
      - name: Upload Allure-reports & Logs
        if: always()
        env:
          home_path: ${{ github.workspace }}/../../..
          bos_file: ${{ github.workspace }}/../../../bos/BosClient.py
          allure_file: ${{ github.workspace }}/../../../allure-2.19.0/bin/allure
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              unset http_proxy && unset https_proxy
              # Fetch the BOS upload client and the Allure CLI once; these host
              # paths are visible here because $work_dir/../../.. is bind-mounted.
              if [ ! -f "${{ env.bos_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
                mkdir ${{ env.home_path }}/bos
                tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
              fi
              if [ ! -f "${{ env.allure_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/allure-2.19.0.zip https://xly-devops.bj.bcebos.com/tools/allure-2.19.0.zip --no-check-certificate
                unzip -q ${{ env.home_path }}/allure-2.19.0.zip -d ${{ env.home_path }}/
              fi
              # Destination prefix: plain PR run, test_build run, or nightly.
              if [[ "${{ env.RUN_DOWNSTREAM }}" == "" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}"
              elif [[ "${{ env.RUN_DOWNSTREAM }}" == "true" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}/test_build"
              else
                bos_prefix="schedule/$(date +%Y%m%d)"
              fi
              cd /workspace/PaddleNLP/model_logs
              for FILE in /workspace/PaddleNLP/model_logs/*; do
                file=$(basename "$FILE")
                python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleNLP/llm/${bos_prefix}/logs
                echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/llm/${bos_prefix}/logs/$file"
              done
              # Build the Allure HTML report and upload everything as one tarball.
              cd /workspace/PaddleNLP/
              ${{ env.allure_file }} generate result -o report
              tar -czf products.tar.gz report model_logs
              python ${{ env.bos_file }} products.tar.gz paddle-github-action/PR/PaddleNLP/llm/${bos_prefix}/logs
              echo "products: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/llm/${bos_prefix}/logs/products.tar.gz"
            '
          fi

      # Best-effort cleanup; never fails the job if the container is absent
      # (e.g. when RUN_DOWNSTREAM == "false" skipped the Run Container step).
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f ${{ env.container_name }} 2>/dev/null || true
Loading
Loading