-
Notifications
You must be signed in to change notification settings - Fork 3.1k
207 lines (196 loc) · 8.92 KB
/
llm.yml
File metadata and controls
207 lines (196 loc) · 8.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# Disabled LLM CI workflow - manually enabled when needed.
# The only live trigger is workflow_dispatch, so nothing runs automatically;
# the original pull_request / schedule / workflow_call triggers are kept below
# as comments for when the workflow is re-enabled. NOTE: the previous attempt
# to disable this file by commenting out `on:`/`env:`/`jobs:` left their
# children as invalid top-level keys, which GitHub's workflow parser rejects.
name: LLM CI (DISABLED)

on:
  # Manual trigger only while the workflow is disabled.
  workflow_dispatch:
  # --- original triggers; uncomment to re-enable automatic runs ---
  # pull_request:
  #   types: [opened, synchronize, reopened]
  #   branches: [develop]
  # schedule:
  #   - cron: "2 0 * * *"
  # workflow_call:
  #   inputs:
  #     run_downstream:
  #       required: true
  #       type: string
  #     image_name:
  #       required: true
  #       type: string

# Cancel a superseded run for the same PR instead of queueing behind it.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: true

env:
  # Empty on non-PR events; steps below branch on that.
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-llm
  ci_scripts: /workspace/PaddleNLP/scripts/regression
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: llm-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  HF_ENDPOINT: https://hf-mirror.com
  STUDIO_GIT_HOST: http://git.prod.idc-to-cloud.aistudio.baidu-int.com
  PPNLP_HOME: /ssd1/paddlenlp
  HF_DATASETS_CACHE: /ssd1/paddlenlp/huggingface/datasets
  TRANSFORMERS_CACHE: /ssd1/paddlenlp/huggingface
  CCACHE_DIR: /home/data/gzcfs/.ccache/gpubox
  # Empty string unless invoked via workflow_call with the run_downstream input.
  RUN_DOWNSTREAM: ${{ inputs.run_downstream }}

defaults:
  run:
    shell: bash

jobs:
  llm-ci:
    name: llm-ci (DISABLED)
    runs-on: [self-hosted, ernie-8gpu]
    steps:
      # Pick the caller-supplied image when invoked via workflow_call,
      # otherwise fall back to the default CI image.
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu18.04-gcc8.2-cuda11.8-cudnn8.6-nccl2.15.5-paddlenlp-latest" >> "$GITHUB_ENV"
          fi

      # Start a long-lived container; all later steps `docker exec` into it.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          CACHE_DIR: /home/data/cfs/.cache
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
              -v $work_dir/../../..:$work_dir/../../.. \
              -v $work_dir:/workspace \
              -v /home/.cache/pip:/home/.cache/pip \
              -v /ssd1/paddlenlp:/ssd1/paddlenlp \
              -v /home/data/gzcfs/.ccache/gpubox:/home/data/gzcfs/.ccache/gpubox \
              -e BRANCH \
              -e AGILE_COMPILE_BRANCH \
              -e PR_ID \
              -e COMMIT_ID \
              -e work_dir \
              -e ci_scripts \
              -e no_proxy \
              -e CI_name \
              -e paddle_whl \
              -e HF_ENDPOINT \
              -e STUDIO_GIT_HOST \
              -e PPNLP_HOME \
              -e HF_DATASETS_CACHE \
              -e TRANSFORMERS_CACHE \
              -e CACHE_DIR \
              -e FLAGS_dynamic_static_unified_comm \
              -e python_version \
              -w /workspace --runtime=nvidia $IMAGE_NAME
          fi

      # Fetch a pre-synced source tarball, then (for PRs) fetch and merge the
      # PR head against the base branch inside the container.
      - name: Download Code
        env:
          work_dir: ${{ github.workspace }}
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping.."
          else
            docker exec -t $container_name /bin/bash -c '
              rm -rf * .[^.]*
              echo "Downloading PaddleNLP.tar"
              wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate
              echo "Extracting PaddleNLP.tar"
              tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar
              source $work_dir/../../../proxy
              cd PaddleNLP
              git config --global user.name "PaddleCI"
              git config --global user.email "paddle_ci@example.com"
              git pull
              git submodule update --init --recursive --force
              if [ -n "${PR_ID}" ]; then
                git fetch origin pull/${PR_ID}/head
                git checkout -b PR_${PR_ID} FETCH_HEAD
                git remote add upstream https://github.com/PaddlePaddle/PaddleNLP.git
                git fetch upstream ${BRANCH}
                git merge ${BRANCH} --no-edit
                git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
              else
                echo "Not in a pull_request event. Skipping PR-specific operations."
              fi
              git log --pretty=oneline -10
            '
          fi

      # Temporary reverts of known-bad commits; drop once the upstream fixes land.
      - name: Skip For Bug
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              cd /workspace/PaddleNLP
              git revert f2477c07272d04244cd3287d1f21c70482a4a85f --no-edit # 套件PR#10413引入bug-待修复
              git revert d74c950e15a35b7a100d1688c89318195cc83bca --no-edit # 框架升级后配合修改,当前因框架bug固定commit适配
            '
          fi

      # Run the regression suite with a pinned Paddle wheel (see echo below),
      # bounded at 2h so a hang cannot occupy the self-hosted runner forever.
      - name: Test
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              ldconfig
              unlink /usr/bin/python3
              ln -sf $(which python${python_version}) /usr/bin/python3
              pip config set global.cache-dir "/home/.cache/pip"
              set -e
              source $work_dir/../../../proxy
              cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
              echo "Paddle PR#73283 PR#74484 import bug, set paddle_commit=8ae742 to skip "
              export paddle_whl=https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/8ae7423e99b2ea96e410968a0ebb3f1795e37205/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
              timeout 2h bash scripts/regression/run_ci.sh python${python_version} ${paddle_whl}
            '
          fi

      # Publish logs and the Allure report to BOS even when the Test step failed.
      - name: Upload Allure-reports & Logs
        if: always()
        env:
          home_path: ${{ github.workspace }}/../../..
          bos_file: ${{ github.workspace }}/../../../bos/BosClient.py
          allure_file: ${{ github.workspace }}/../../../allure-2.19.0/bin/allure
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              unset http_proxy && unset https_proxy
              if [ ! -f "${{ env.bos_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
                mkdir ${{ env.home_path }}/bos
                tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
              fi
              if [ ! -f "${{ env.allure_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/allure-2.19.0.zip https://xly-devops.bj.bcebos.com/tools/allure-2.19.0.zip --no-check-certificate
                unzip -q ${{ env.home_path }}/allure-2.19.0.zip -d ${{ env.home_path }}/
              fi
              if [[ "${{ env.RUN_DOWNSTREAM }}" == "" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}"
              elif [[ "${{ env.RUN_DOWNSTREAM }}" == "true" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}/test_build"
              else
                bos_prefix="schedule/$(date +%Y%m%d)"
              fi
              cd /workspace/PaddleNLP/model_logs
              for FILE in /workspace/PaddleNLP/model_logs/*; do
                file=$(basename "$FILE")
                python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleNLP/llm/${bos_prefix}/logs
                echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/llm/${bos_prefix}/logs/$file"
              done
              cd /workspace/PaddleNLP/
              ${{ env.allure_file }} generate result -o report
              tar -czf products.tar.gz report model_logs
              python ${{ env.bos_file }} products.tar.gz paddle-github-action/PR/PaddleNLP/llm/${bos_prefix}/logs
              echo "products: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/llm/${bos_prefix}/logs/products.tar.gz"
            '
          fi

      # Always reclaim the runner; tolerate the container not existing.
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f ${{ env.container_name }} 2>/dev/null || true