# Workflow file for run: "Support fp32 head for qwen and internlm models" (#5452)

name: pr_ete_test

# Run the PR end-to-end test suite whenever build-relevant paths change,
# or on demand via workflow_dispatch.
on:
  pull_request:
    paths:
      - ".github/workflows/pr_ete_test.yml"
      - "cmake/**"
      - "src/**"
      - "autotest/**"
      - "3rdparty/**"
      - "lmdeploy/**"
      - "requirements/**"
      - "requirements_cuda.txt"
      - "CMakeLists.txt"
      - "setup.py"
  workflow_dispatch:

# At most one run per PR (falls back to ref for non-PR triggers);
# a newer push cancels the in-flight run.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
  # Permits deprecated-Node actions (e.g. actions/checkout@v2 below).
  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
  PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA
jobs:
  pr_functions_test:
    runs-on: [self-hosted, linux-a100-pr]
    timeout-minutes: 120
    env:
      REPORT_DIR: /nvme/qa_test_models/test-reports
    container:
      image: nvidia/cuda:12.4.1-devel-ubuntu22.04
      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never"
      volumes:
        - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip
        - /nvme/share_data/github-actions/packages:/root/packages
        - /nvme/qa_test_models:/nvme/qa_test_models
        - /mnt/121:/mnt/121
        - /mnt/104:/mnt/104
        - /mnt/bigdisk:/mnt/bigdisk
        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    steps:
      - name: Setup systems
        # Install build tooling, Python 3.10 from deadsnakes, and create a venv at /opt/py3.
        run: |
          apt-get update -y && apt-get install -y software-properties-common wget vim git curl &&\
          curl https://sh.rustup.rs -sSf | sh -s -- -y &&\
          add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \
          ninja-build rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \
          && apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3
          echo "PATH=/opt/py3/bin:$PATH" >> "$GITHUB_ENV"
      - name: Clone repository
        # NOTE(review): checkout@v2 runs on a deprecated Node runtime; it is kept working
        # only via ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION above — consider upgrading to v4.
        uses: actions/checkout@v2
      - name: Install pytorch
        run: |
          python3 -m pip cache dir
          python3 -m pip install --upgrade pip setuptools==69.5.1
      - name: Build lmdeploy
        run: |
          cp /nvme/qa_test_models/offline_pkg/openmpi-4.1.5.tar.gz .
          tar xf openmpi-4.1.5.tar.gz && cd openmpi-4.1.5 && ./configure --prefix=/usr/local/openmpi
          make -j$(nproc) && make install && cd .. && rm -rf openmpi-4.1.5*
          export PATH=$PATH:/usr/local/openmpi/bin
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib
          # We need to pin transformers version (<4.52.0) to avoid test failures due to breaking changes.
          python3 -m pip install transformers==4.51.3
          python3 -m pip install -r requirements/lite.txt
          python3 -m pip install -r requirements/test.txt
          python3 -m pip install -e .
          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
          python3 -m pip install /nvme/qa_test_models/offline_pkg/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
      - name: Check env
        run: |
          python3 -m pip list
          lmdeploy check_env
          mkdir allure-results
          echo "starttime=$(date +%s)" > allure-results/status.txt
      - name: Test lmdeploy - func
        run: |
          CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_2' -x --alluredir=allure-results --clean-alluredir
          CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_1' -n 2 -x --alluredir=allure-results
      - name: Update transformers
        run: |
          pip install transformers==4.57.3
      - name: Test restful server - turbomind Qwen3-32B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-32B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > turbomind_Qwen3-32B_start_restful.log 2>&1 &
          # NOTE(review): this only prints the PID; append to "$GITHUB_ENV" if later steps need it.
          echo "restful_pid=$!"
          # Poll for up to 15 minutes (180 x 5s) until the server is healthy, then run the tests.
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'Qwen/Qwen3-32B and turbomind' -m 'not not_turbomind and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'Qwen/Qwen3-32B and turbomind' -m 'not not_turbomind and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat turbomind_Qwen3-32B_start_restful.log
          exit 1
      - name: Test restful server - turbomind InternVL3-38B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > turbomind_InternVL3-38B_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          # Poll for up to 15 minutes (180 x 5s) until the server is healthy, then run the tests.
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'OpenGVLab/InternVL3-38B and turbomind' -m 'not not_turbomind and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'OpenGVLab/InternVL3-38B and turbomind' -m 'not not_turbomind and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat turbomind_InternVL3-38B_start_restful.log
          exit 1
      - name: Test restful server - turbomind Qwen3-30B-A3B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > turbomind_Qwen3-30B-A3B_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          # Poll for up to 15 minutes (180 x 5s) until the server is healthy, then run the tests.
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'Qwen/Qwen3-30B-A3B and turbomind' -m 'not not_turbomind and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'Qwen/Qwen3-30B-A3B and turbomind' -m 'not not_turbomind and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat turbomind_Qwen3-30B-A3B_start_restful.log
          exit 1
      - name: Test restful server - pytorch Qwen3-30B-A3B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --enable-return-routed-experts --allow-terminate-by-client > pytorch_Qwen3-30B-A3B_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          # Poll for up to 15 minutes (180 x 5s) until the server is healthy, then run the tests.
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'Qwen/Qwen3-30B-A3B and pytorch' -m 'not not_pytorch and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'Qwen/Qwen3-30B-A3B and pytorch' -m 'not not_pytorch' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat pytorch_Qwen3-30B-A3B_start_restful.log
          exit 1
      - name: Test restful server - pytorch Qwen3-VL-30B-A3B-Instruct
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-VL-30B-A3B-Instruct --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > pytorch_Qwen3-VL-30B-A3B-Instruct_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          # Poll for up to 15 minutes (180 x 5s) until the server is healthy, then run the tests.
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'Qwen/Qwen3-VL-30B-A3B-Instruct and pytorch' -m 'not not_pytorch and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'Qwen/Qwen3-VL-30B-A3B-Instruct and pytorch' -m 'not not_pytorch and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat pytorch_Qwen3-VL-30B-A3B-Instruct_start_restful.log
          exit 1
      - name: Test restful server - pytorch InternVL3_5-30B-A3B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          # Poll for up to 15 minutes (180 x 5s) until the server is healthy, then run the tests.
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'OpenGVLab/InternVL3_5-30B-A3B and pytorch' -m 'not not_pytorch and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'OpenGVLab/InternVL3_5-30B-A3B and pytorch' -m 'not not_pytorch and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat pytorch_InternVL3_5-30B-A3B_start_restful.log
          exit 1
      - name: Generate reports
        if: always()
        # Move allure results (including status.txt created in "Check env") into the
        # timestamped report directory on the shared volume.
        run: |
          export date_today="$(date +'%Y%m%d-%H%M%S')"
          export report_dir="$REPORT_DIR/$date_today"
          echo "Save report to $report_dir"
          mv allure-results "$report_dir"
          mv *start_restful.log "$report_dir"
          echo "status=done" >> "$report_dir/status.txt"
      - name: Clear workfile
        if: always()
        # Recreate the workspace directory world-writable for the next run on this
        # self-hosted runner.
        run: |
          export workdir=$(pwd)
          cd ..
          rm -rf "$workdir"
          mkdir "$workdir"
          chmod -R 777 "$workdir"