Skip to content

[CI] Add workflow for LLM & unittest-gpu #10878

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions .github/workflows/distribute-a100.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# CI workflow that runs PaddleNLP distributed-training tests on A100 GPUs.
name: Distribute CI (A100)

on:
  pull_request:
    types: [opened, synchronize, reopened]
    branches: [develop]
  schedule:
    # Nightly run at 00:01 UTC.
    - cron: "1 0 * * *"
  workflow_call:
    inputs:
      # "false" skips every downstream step; any other value runs them.
      run_downstream:
        required: true
        type: string
      # Docker image override; empty selects the default image in the job.
      image_name:
        required: true
        type: string

concurrency:
  # PR runs are grouped per PR number. Scheduled / workflow_call runs have no
  # PR number; fall back to run_id so they do not all collapse into one group
  # and cancel each other via cancel-in-progress.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

env:
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  # Base name for the per-run Docker container (timestamp appended later).
  TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-Distribut-A100
  ci_scripts: /workspace/PaddleNLP/scripts/distribute
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: distribute-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  GITHUB_EVENT_NAME: ${{ github.event_name }}
  RUN_DOWNSTREAM: ${{ inputs.run_downstream }}

defaults:
  run:
    shell: bash

jobs:
  distribute-a100-ci:
    name: distribute-a100-ci
    # Self-hosted runner group with A100 GPUs attached.
    runs-on:
      group: Distribute
    steps:
      # Resolve the Docker image: an explicit workflow_call input wins;
      # otherwise fall back to the default Paddle CUDA 11.8 dev image.
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=registry.baidubce.com/paddlepaddle/paddle:latest-dev-cuda11.8-cudnn8.6-trt8.5-gcc82" >> "$GITHUB_ENV"
          fi

      # Start a detached container that every later step `docker exec`s into.
      # When RUN_DOWNSTREAM == "false" no container is started; the later
      # steps check the same flag and skip themselves.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          CACHE_DIR: /home/data/cfs/.cache
          FLAGS_dynamic_static_unified_comm: "True"
          FLAGS_dataloader_use_file_descriptor: "False"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          # container_name is always written to GITHUB_ENV (even when the run
          # is skipped) so the cleanup step can reference it unconditionally.
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            # Bind-mount the host CUDA libraries, NVIDIA device nodes and
            # nvidia-smi into the container.
            export CUDA_SO="$(\ls -d /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls -d /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
            export DEVICES="$(\ls -d /dev/nvidia* | xargs -I{} echo "-v {}:{}") $(\ls /dev/nvidia-caps/* | xargs -I{} echo "-v {}:{}")"
            export SMI="-v /usr/bin/nvidia-smi:/usr/bin/nvidia-smi"
            docker run -d -t --name ${container_name} ${CUDA_SO} ${DEVICES} ${SMI} --runtime=nvidia --shm-size=32G \
              --network host -v /dev/shm:/dev/shm \
              -v $work_dir/../../..:$work_dir/../../.. \
              -v $work_dir:/workspace \
              -v /home/.cache/pip:/home/.cache/pip \
              -v /home/FleetX_CI:/fleetx_data \
              -v /home/Llm_gpt_CI:/llm_gpt_data \
              -v /home/Llama_CI:/llama_data \
              -e BRANCH \
              -e AGILE_COMPILE_BRANCH \
              -e PR_ID \
              -e COMMIT_ID \
              -e work_dir \
              -e ci_scripts \
              -e no_proxy \
              -e CI_name \
              -e paddle_whl \
              -e FLAGS_dynamic_static_unified_comm \
              -e FLAGS_dataloader_use_file_descriptor \
              -e python_version \
              -w /workspace $IMAGE_NAME
          fi

- name: Download Code
run: |
if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
echo "Not in a pull_request or test_build event. Skipping.."
else
docker exec -t $container_name /bin/bash -c '
rm -rf * .[^.]*
echo "Downloading PaddleNLP.tar.gz"
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate
echo "Extracting PaddleNLP.tar.gz"
tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar
source $work_dir/../../../proxy
cd PaddleNLP
git config --global user.name "PaddleCI"
git config --global user.email "[email protected]"
git pull
git submodule update --init --recursive --force
if [ -n "${PR_ID}" ]; then
git fetch origin pull/${PR_ID}/head
git checkout -b PR_${PR_ID} FETCH_HEAD
git remote add upstream https://github.com/PaddlePaddle/PaddleFormers.git
git fetch upstream ${BRANCH}
git merge ${BRANCH} --no-edit
git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
else
echo "Not in a pull_request event. Skipping PR-specific operations."
fi
git log --pretty=oneline -10
'
fi

      # Run the distributed-training test suite inside the container.
      - name: Test
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              ldconfig
              # Make `python` point at the requested interpreter version.
              ln -sf $(which python${python_version}) /usr/bin/python
              pip config set global.cache-dir "/home/.cache/pip"
              source $work_dir/../../../proxy
              set -e
              cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
              # Test entry point with a hard 80-minute budget.
              timeout 80m bash scripts/distribute/run_ci.sh ${paddle_whl}
            '
          fi

      # Push per-case logs to BOS, even when earlier steps failed.
      - name: Upload Logs
        if: always()
        env:
          home_path: ${{ github.workspace }}/..
          bos_file: ${{ github.workspace }}/../bos_retry/BosClient.py
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              unset http_proxy && unset https_proxy
              # Fetch the BOS upload client once; these host paths are visible
              # in the container because $work_dir/../../.. is bind-mounted.
              if [ ! -f "${{ env.bos_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/bos_retry.tar.gz https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate
                mkdir ${{ env.home_path }}/bos_retry
                tar xf ${{ env.home_path }}/bos_retry.tar.gz -C ${{ env.home_path }}/bos_retry
              fi

              # Destination prefix: plain PR run, test_build run, or nightly.
              if [[ "${{ env.RUN_DOWNSTREAM }}" == "" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}"
              elif [[ "${{ env.RUN_DOWNSTREAM }}" == "true" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}/test_build"
              else
                bos_prefix="schedule/$(date +%Y%m%d)"
              fi

              cd /workspace/case_logs
              for FILE in /workspace/case_logs/*; do
                file=$(basename "$FILE")
                python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleNLP/distribute-a100/${bos_prefix}/logs
                echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/distribute-a100/${bos_prefix}/logs/$file"
              done
            '
          fi

- name: Terminate And Delete the Container
if: always()
run: |
docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
docker rm -f $container_name 2>/dev/null || true
206 changes: 206 additions & 0 deletions .github/workflows/llm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
# CI workflow that runs the PaddleNLP LLM regression suite on GPU runners.
name: LLM CI

on:
  pull_request:
    types: [opened, synchronize, reopened]
    branches: [develop]
  schedule:
    # Nightly run at 00:02 UTC.
    - cron: "2 0 * * *"
  workflow_call:
    inputs:
      # "false" skips every downstream step; any other value runs them.
      run_downstream:
        required: true
        type: string
      # Docker image override; empty selects the default image in the job.
      image_name:
        required: true
        type: string

concurrency:
  # PR runs are grouped per PR number. Scheduled / workflow_call runs have no
  # PR number; fall back to run_id so they do not all collapse into one group
  # and cancel each other via cancel-in-progress.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

env:
  PR_ID: ${{ github.event.pull_request.number }}
  COMMIT_ID: ${{ github.event.pull_request.head.sha }}
  # Base name for the per-run Docker container (timestamp appended later).
  TASK: paddlenlp-CI-${{ github.event.pull_request.number }}-llm
  ci_scripts: /workspace/PaddleNLP/scripts/regression
  BRANCH: ${{ github.event.pull_request.base.ref }}
  AGILE_COMPILE_BRANCH: ${{ github.event.pull_request.base.ref }}
  CI_name: llm-ci
  no_proxy: "localhost,bj.bcebos.com,su.bcebos.com,bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
  HF_ENDPOINT: https://hf-mirror.com
  STUDIO_GIT_HOST: http://git.prod.idc-to-cloud.aistudio.baidu-int.com
  PPNLP_HOME: /ssd1/paddlenlp
  HF_DATASETS_CACHE: /ssd1/paddlenlp/huggingface/datasets
  TRANSFORMERS_CACHE: /ssd1/paddlenlp/huggingface
  CCACHE_DIR: /home/data/gzcfs/.ccache/gpubox
  RUN_DOWNSTREAM: ${{ inputs.run_downstream }}

defaults:
  run:
    shell: bash

jobs:
  llm-ci:
    name: llm-ci
    # Self-hosted 8-GPU runner.
    runs-on: [self-hosted, ernie-8gpu]
    steps:
      # Resolve the Docker image: an explicit workflow_call input wins;
      # otherwise fall back to the default paddlecloud image.
      - name: Determine Image Name
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
        run: |
          if [[ -n "${IMAGE_NAME}" ]]; then
            echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_ENV"
          else
            echo "IMAGE_NAME=iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu18.04-gcc8.2-cuda11.8-cudnn8.6-nccl2.15.5-paddlenlp-latest" >> "$GITHUB_ENV"
          fi

      # Start a detached container that every later step `docker exec`s into.
      # When RUN_DOWNSTREAM == "false" no container is started; the later
      # steps check the same flag and skip themselves.
      - name: Run Container
        env:
          work_dir: ${{ github.workspace }}
          CACHE_DIR: /home/data/cfs/.cache
          FLAGS_dynamic_static_unified_comm: "True"
          python_version: "3.10"
          paddle_whl: https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
        run: |
          # container_name is always written to GITHUB_ENV (even when the run
          # is skipped) so the cleanup step can reference it unconditionally.
          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
          echo "container_name=${container_name}" >> "$GITHUB_ENV"
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
              -v $work_dir/../../..:$work_dir/../../.. \
              -v $work_dir:/workspace \
              -v /home/.cache/pip:/home/.cache/pip \
              -v /ssd1/paddlenlp:/ssd1/paddlenlp \
              -v /home/data/gzcfs/.ccache/gpubox:/home/data/gzcfs/.ccache/gpubox \
              -e BRANCH \
              -e AGILE_COMPILE_BRANCH \
              -e PR_ID \
              -e COMMIT_ID \
              -e work_dir \
              -e ci_scripts \
              -e no_proxy \
              -e CI_name \
              -e paddle_whl \
              -e HF_ENDPOINT \
              -e STUDIO_GIT_HOST \
              -e PPNLP_HOME \
              -e HF_DATASETS_CACHE \
              -e TRANSFORMERS_CACHE \
              -e CACHE_DIR \
              -e FLAGS_dynamic_static_unified_comm \
              -e python_version \
              -w /workspace --runtime=nvidia $IMAGE_NAME
          fi

- name: Download Code
env:
work_dir: ${{ github.workspace }}
run: |
if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
echo "Not in a pull_request or test_build event. Skipping.."
else
docker exec -t $container_name /bin/bash -c '
rm -rf * .[^.]*
echo "Downloading PaddleNLP.tar.gz"
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleNLP.tar --no-check-certificate
echo "Extracting PaddleNLP.tar.gz"
tar xf PaddleNLP.tar && rm -rf PaddleNLP.tar
source $work_dir/../../../proxy
cd PaddleNLP
git config --global user.name "PaddleCI"
git config --global user.email "[email protected]"
git pull
git submodule update --init --recursive --force
if [ -n "${PR_ID}" ]; then
git fetch origin pull/${PR_ID}/head
git checkout -b PR_${PR_ID} FETCH_HEAD
git remote add upstream https://github.com/PaddlePaddle/PaddleNLP.git
git fetch upstream ${BRANCH}
git merge ${BRANCH} --no-edit
git diff --numstat ${BRANCH} -- | awk "{print \$NF}"
else
echo "Not in a pull_request event. Skipping PR-specific operations."
fi
git log --pretty=oneline -10
'
fi

      # Temporarily revert commits that are known to break this suite.
      - name: Skip For Bug
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              cd /workspace/PaddleNLP
              git revert f2477c07272d04244cd3287d1f21c70482a4a85f --no-edit # bug introduced by PR #10413 - pending fix
              git revert 3e9d3518cbecd8357cec14f059776272713d5c62 --no-edit # bug introduced by PR #10912 - pending fix
              # rm -rf tests/llm/test_grpo.py tests/llm/test_reinforce_plus_plus.py
            '
          fi

      # Run the LLM regression suite inside the container.
      - name: Test
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              ldconfig
              # Repoint python3 at the requested interpreter version.
              unlink /usr/bin/python3
              ln -sf $(which python${python_version}) /usr/bin/python3
              pip config set global.cache-dir "/home/.cache/pip"
              set -e
              source $work_dir/../../../proxy
              cd /workspace/PaddleNLP && git config --global --add safe.directory $PWD
              # NOTE(review): this overrides the paddle_whl passed in via -e
              # with a wheel pinned to one commit, until the suite adapts to
              # Paddle #73283 — remove once the adaptation lands.
              export paddle_whl=https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuSome-LinuxCentos-Gcc82-Cuda118-Cudnn86-Trt85-Py310-CINN-Compile/8ae7423e99b2ea96e410968a0ebb3f1795e37205/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
              # Regression entry point with a hard 2-hour budget.
              timeout 2h bash scripts/regression/run_ci.sh python${python_version} ${paddle_whl}
            '
          fi

      # Push model logs and an Allure report bundle to BOS, even on failure.
      - name: Upload Allure-reports & Logs
        if: always()
        env:
          home_path: ${{ github.workspace }}/../../..
          bos_file: ${{ github.workspace }}/../../../bos/BosClient.py
          allure_file: ${{ github.workspace }}/../../../allure-2.19.0/bin/allure
        run: |
          if [[ "$RUN_DOWNSTREAM" == "false" ]]; then
            echo "Not in a pull_request or test_build event. Skipping..."
          else
            docker exec -t $container_name /bin/bash -c '
              unset http_proxy && unset https_proxy
              # Fetch the BOS upload client and the Allure CLI once; these host
              # paths are visible here because $work_dir/../../.. is bind-mounted.
              if [ ! -f "${{ env.bos_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
                mkdir ${{ env.home_path }}/bos
                tar xf ${{ env.home_path }}/bos_new.tar.gz -C ${{ env.home_path }}/bos
              fi
              if [ ! -f "${{ env.allure_file }}" ]; then
                wget -q --no-proxy -O ${{ env.home_path }}/allure-2.19.0.zip https://xly-devops.bj.bcebos.com/tools/allure-2.19.0.zip --no-check-certificate
                unzip -q ${{ env.home_path }}/allure-2.19.0.zip -d ${{ env.home_path }}/
              fi
              # Destination prefix: plain PR run, test_build run, or nightly.
              if [[ "${{ env.RUN_DOWNSTREAM }}" == "" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}"
              elif [[ "${{ env.RUN_DOWNSTREAM }}" == "true" && -n "${PR_ID}" ]]; then
                bos_prefix="${PR_ID}/${COMMIT_ID}/test_build"
              else
                bos_prefix="schedule/$(date +%Y%m%d)"
              fi
              cd /workspace/PaddleNLP/model_logs
              for FILE in /workspace/PaddleNLP/model_logs/*; do
                file=$(basename "$FILE")
                python ${{ env.bos_file }} $file paddle-github-action/PR/PaddleNLP/llm/${bos_prefix}/logs
                echo "$file: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/llm/${bos_prefix}/logs/$file"
              done
              # Build the Allure HTML report and upload everything as one tarball.
              cd /workspace/PaddleNLP/
              ${{ env.allure_file }} generate result -o report
              tar -czf products.tar.gz report model_logs
              python ${{ env.bos_file }} products.tar.gz paddle-github-action/PR/PaddleNLP/llm/${bos_prefix}/logs
              echo "products: https://paddle-github-action.bj.bcebos.com/PR/PaddleNLP/llm/${bos_prefix}/logs/products.tar.gz"
            '
          fi

      # Best-effort cleanup; never fails the job if the container is absent
      # (e.g. when RUN_DOWNSTREAM == "false" skipped the Run Container step).
      - name: Terminate And Delete the Container
        if: always()
        run: |
          docker rm -f ${{ env.container_name }} 2>/dev/null || true
Loading
Loading