# Support fp32 head for qwen and internlm models (#5452)
# Workflow file for this run.
name: pr_ete_test

on:
  pull_request:
    paths:
      - ".github/workflows/pr_ete_test.yml"
      - "cmake/**"
      - "src/**"
      - "autotest/**"
      - "3rdparty/**"
      - "lmdeploy/**"
      - "requirements/**"
      - "requirements_cuda.txt"
      - "CMakeLists.txt"
      - "setup.py"
  workflow_dispatch:

# Cancel a superseded run of the same PR (or branch, for manual dispatch)
# instead of queueing behind it.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
  PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA

jobs:
  pr_functions_test:
    runs-on: [self-hosted, linux-a100-pr]
    timeout-minutes: 120
    env:
      REPORT_DIR: /nvme/qa_test_models/test-reports
    container:
      image: nvidia/cuda:12.4.1-devel-ubuntu22.04
      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never"
      volumes:
        - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip
        - /nvme/share_data/github-actions/packages:/root/packages
        - /nvme/qa_test_models:/nvme/qa_test_models
        - /mnt/121:/mnt/121
        - /mnt/104:/mnt/104
        - /mnt/bigdisk:/mnt/bigdisk
        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    steps:
      - name: Setup systems
        run: |
          apt-get update -y && apt-get install -y software-properties-common wget vim git curl &&\
          curl https://sh.rustup.rs -sSf | sh -s -- -y &&\
          add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \
          ninja-build rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \
          && apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3
          echo "PATH=/opt/py3/bin:$PATH" >> "$GITHUB_ENV"
      - name: Clone repository
        # NOTE(review): checkout@v2 runs on an EOL Node runtime; kept as-is
        # because ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION is set above. Confirm
        # the self-hosted runner supports a newer Node before bumping to v4.
        uses: actions/checkout@v2
      - name: Install pytorch
        run: |
          python3 -m pip cache dir
          python3 -m pip install --upgrade pip setuptools==69.5.1
      - name: Build lmdeploy
        run: |
          cp /nvme/qa_test_models/offline_pkg/openmpi-4.1.5.tar.gz .
          tar xf openmpi-4.1.5.tar.gz && cd openmpi-4.1.5 && ./configure --prefix=/usr/local/openmpi
          make -j$(nproc) && make install && cd .. && rm -rf openmpi-4.1.5*
          export PATH=$PATH:/usr/local/openmpi/bin
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib
          # We need to pin transformers version (<4.52.0) to avoid test failures due to breaking changes.
          python3 -m pip install transformers==4.51.3
          python3 -m pip install -r requirements/lite.txt
          python3 -m pip install -r requirements/test.txt
          python3 -m pip install -e .
          # the install package from https://github.com/Dao-AILab/flash-attention/releases
          python3 -m pip install /nvme/qa_test_models/offline_pkg/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
      - name: Check env
        run: |
          python3 -m pip list
          lmdeploy check_env
          mkdir allure-results
          echo "starttime=$(date +%s)" > allure-results/status.txt
      - name: Test lmdeploy - func
        run: |
          CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_2' -x --alluredir=allure-results --clean-alluredir
          CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_1' -n 2 -x --alluredir=allure-results
      - name: Update transformers
        run: |
          pip install transformers==4.57.3
      # The six steps below share one pattern: launch an api_server in the
      # background, poll /health for up to 15 min (180 * 5 s), run the restful
      # test suites once the server is up, then ask it to terminate itself.
      - name: Test restful server - turbomind Qwen3-32B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-32B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > turbomind_Qwen3-32B_start_restful.log 2>&1 &
          # NOTE(review): this only prints the PID to the log; if it was meant to
          # be exported to later steps it should be appended to "$GITHUB_ENV".
          # As written the PID is never used — shutdown relies on /terminate.
          echo "restful_pid=$!"
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'Qwen/Qwen3-32B and turbomind' -m 'not not_turbomind and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'Qwen/Qwen3-32B and turbomind' -m 'not not_turbomind and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat turbomind_Qwen3-32B_start_restful.log
          exit 1
      - name: Test restful server - turbomind InternVL3-38B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > turbomind_InternVL3-38B_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'OpenGVLab/InternVL3-38B and turbomind' -m 'not not_turbomind and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'OpenGVLab/InternVL3-38B and turbomind' -m 'not not_turbomind and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat turbomind_InternVL3-38B_start_restful.log
          exit 1
      - name: Test restful server - turbomind Qwen3-30B-A3B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > turbomind_Qwen3-30B-A3B_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'Qwen/Qwen3-30B-A3B and turbomind' -m 'not not_turbomind and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'Qwen/Qwen3-30B-A3B and turbomind' -m 'not not_turbomind and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat turbomind_Qwen3-30B-A3B_start_restful.log
          exit 1
      - name: Test restful server - pytorch Qwen3-30B-A3B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --enable-return-routed-experts --allow-terminate-by-client > pytorch_Qwen3-30B-A3B_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'Qwen/Qwen3-30B-A3B and pytorch' -m 'not not_pytorch and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'Qwen/Qwen3-30B-A3B and pytorch' -m 'not not_pytorch' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat pytorch_Qwen3-30B-A3B_start_restful.log
          exit 1
      - name: Test restful server - pytorch Qwen3-VL-30B-A3B-Instruct
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-VL-30B-A3B-Instruct --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > pytorch_Qwen3-VL-30B-A3B-Instruct_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'Qwen/Qwen3-VL-30B-A3B-Instruct and pytorch' -m 'not not_pytorch and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'Qwen/Qwen3-VL-30B-A3B-Instruct and pytorch' -m 'not not_pytorch and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat pytorch_Qwen3-VL-30B-A3B-Instruct_start_restful.log
          exit 1
      - name: Test restful server - pytorch InternVL3_5-30B-A3B
        run: |
          CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 &
          echo "restful_pid=$!"
          for i in $(seq 1 180)
          do
            sleep 5
            echo "health check try $i"
            if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then
              pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k 'OpenGVLab/InternVL3_5-30B-A3B and pytorch' -m 'not not_pytorch and not internlm2_5 and not interns1 and pr_test' --alluredir=allure-results
              pytest autotest/interface/restful/test_restful_generate.py -n 20 -k 'OpenGVLab/InternVL3_5-30B-A3B and pytorch' -m 'not not_pytorch and not experts' --alluredir=allure-results
              curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
              exit 0
            fi
          done
          echo "health check fail"
          curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1
          cat pytorch_InternVL3_5-30B-A3B_start_restful.log
          exit 1
      - name: Generate reports
        if: always()
        run: |
          export date_today="$(date +'%Y%m%d-%H%M%S')"
          export report_dir="$REPORT_DIR/$date_today"
          echo "Save report to $report_dir"
          # mv renames allure-results into the (not yet existing) report dir.
          mv allure-results "$report_dir"
          mv *start_restful.log "$report_dir"
          echo "status=done" >> "$report_dir/status.txt"
      - name: Clear workfile
        if: always()
        run: |
          export workdir=$(pwd)
          cd ..
          # ${workdir:?} aborts instead of expanding empty — guards the rm -rf.
          rm -rf "${workdir:?}"
          mkdir "$workdir"
          chmod -R 777 "$workdir"