Commit c875e37

[ci] refactor eval into api eval and add h800 eval workflow (#4008)
* TEST: add api evaluate
* TEST: rm qwen1.5_7b test
* TEST: add evaluate result to github
* CI: update workflow docker
* TEST: update code based on comments
* TEST: update docker
* add H800 base model eval
* update
* update
* Update eval_base_config.py
* update
* update
* update max_out_len
* set oc data path
* update
* update
* update
* update
* update
* update
* update
* update
* Update evaluate_h800.yml
* update
* Update eval_base_config.py
* update
* update
* update api outputfolder name

---------

Co-authored-by: littlegy <[email protected]>
1 parent 7863313 commit c875e37

18 files changed: +1094 −107 lines

.github/scripts/eval_base_config.py

Lines changed: 81 additions & 53 deletions
@@ -39,26 +39,6 @@
     wikibench_datasets  # noqa: F401, E501
 from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
     winogrande_datasets  # noqa: F401, E501
-from opencompass.configs.models.baichuan.hf_baichuan_7b import models as hf_baichuan_7b  # noqa: F401, E501
-from opencompass.configs.models.gemma.hf_gemma_7b import models as hf_gemma_7b  # noqa: F401, E501
-from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import models as hf_internlm2_5_7b  # noqa: F401, E501
-from opencompass.configs.models.hf_internlm.hf_internlm2_7b import models as hf_internlm2_7b  # noqa: F401, E501
-from opencompass.configs.models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b  # noqa: F401, E501
-from opencompass.configs.models.hf_internlm.hf_internlm_7b import models as hf_internlm_7b  # noqa: F401, E501
-from opencompass.configs.models.hf_internlm.hf_internlm_20b import models as hf_internlm_20b  # noqa: F401, E501
-from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
-    models as lmdeploy_internlm2_5_7b  # noqa: F401, E501
-from opencompass.configs.models.hf_llama.hf_llama2_7b import models as hf_llama2_7b  # noqa: F401, E501
-from opencompass.configs.models.hf_llama.hf_llama3_8b import models as hf_llama3_8b  # noqa: F401, E501
-from opencompass.configs.models.mistral.hf_mistral_7b_v0_1 import models as hf_mistral_7b_v0_1  # noqa: F401, E501
-from opencompass.configs.models.mistral.hf_mixtral_8x7b_v0_1 import \
-    models as hf_mixtral_8x7b_v0_1  # noqa: F401, E501
-from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import models as lmdeploy_qwen2_5_7b  # noqa: F401, E501
-from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as hf_qwen1_5_7b  # noqa: F401, E501
-from opencompass.configs.models.qwen.hf_qwen2_7b import models as hf_qwen2_7b  # noqa: F401, E501
-from opencompass.configs.models.qwen.hf_qwen_7b import models as hf_qwen_7b  # noqa: F401, E501
-from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b import models as lmdeploy_qwen1_5_7b  # noqa: F401, E501
-from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import models as lmdeploy_qwen2_7b  # noqa: F401, E501
 # Summary Groups
 from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups  # noqa: F401, E501
 from opencompass.configs.summarizers.groups.GaokaoBench import GaokaoBench_summary_groups  # noqa: F401, E501
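Note: these imports appear to sit inside OpenCompass's `read_base()` config idiom, which is why every line carries a `# noqa: F401` marker even though nothing references the names directly. A minimal sketch of that idiom, with an illustrative (not verbatim) dataset module path:

```python
# Sketch of the OpenCompass config pattern eval_base_config.py appears to use;
# the dataset module path below is illustrative, not copied from the repo.
from mmengine.config import read_base

with read_base():
    # Names imported inside read_base() are merged into this config file's
    # namespace, so the "unused" imports are intentional and noqa-suppressed.
    from opencompass.configs.datasets.race.race_gen import \
        race_datasets  # noqa: F401
```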
@@ -69,6 +49,14 @@

 # read models
 race_datasets = [race_datasets[1]]
+mmlu_datasets = [
+    x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
+        'business_ethics', 'clinical_knowledge', 'college_medicine', 'global_facts', 'human_aging', 'management',
+        'marketing', 'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting',
+        'professional_medicine', 'virology'
+    ]
+]
+
 summarizer = dict(
     dataset_abbrs=[
         ['race-high', 'accuracy'],
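The new `mmlu_datasets` filter trims MMLU down to a 13-subject subset by matching on each dataset config's `abbr` field. A self-contained sketch of the same mechanics, using made-up entries to show what survives the filter:

```python
# Hypothetical stand-ins for OpenCompass dataset configs, which are plain
# dicts carrying an 'abbr' key such as 'lukaemon_mmlu_virology'.
mmlu_datasets = [
    {'abbr': 'lukaemon_mmlu_virology'},
    {'abbr': 'lukaemon_mmlu_astronomy'},
    {'abbr': 'lukaemon_mmlu_marketing'},
]
kept_subjects = {'virology', 'marketing'}

# Same pattern as the diff: strip the shared prefix, keep allow-listed subjects.
mmlu_datasets = [
    d for d in mmlu_datasets
    if d['abbr'].replace('lukaemon_mmlu_', '') in kept_subjects
]
print([d['abbr'] for d in mmlu_datasets])
# -> ['lukaemon_mmlu_virology', 'lukaemon_mmlu_marketing']
```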
@@ -138,49 +126,89 @@
     summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )
 
-turbomind_qwen1_5_7b = deepcopy(*lmdeploy_qwen1_5_7b)
-turbomind_qwen2_7b = deepcopy(*lmdeploy_qwen2_7b)
-turbomind_qwen2_5_7b = deepcopy(*lmdeploy_qwen2_5_7b)
-turbomind_qwen2_5_14b = deepcopy(*lmdeploy_qwen2_5_7b)
-turbomind_qwen2_5_14b['path'] = 'Qwen/Qwen2.5-14B'
-turbomind_internlm2_5_7b = deepcopy(*lmdeploy_internlm2_5_7b)
-turbomind_internlm2_5_7b_4bits = deepcopy(*lmdeploy_internlm2_5_7b)
-turbomind_internlm2_5_7b_batch1 = deepcopy(*lmdeploy_internlm2_5_7b)
-turbomind_internlm2_5_7b_batch1_4bits = deepcopy(*lmdeploy_internlm2_5_7b)
-
 base_model = dict(
     type=TurboMindModel,
-    engine_config=dict(session_len=7168, max_batch_size=128, tp=1),
+    engine_config=dict(session_len=7168, tp=1),
     gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024),
     max_seq_len=7168,
    max_out_len=1024,
-    batch_size=128,
+    batch_size=32,
     run_cfg=dict(num_gpus=1),
 )
 
+turbomind_qwen2_5_1_5b = deepcopy(base_model)
+turbomind_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B'
+turbomind_qwen2_5_1_5b['abbr'] = 'turbomind_qwen2_5_1_5b'
+turbomind_qwen2_5_7b = deepcopy(base_model)
+turbomind_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B'
+turbomind_qwen2_5_7b['abbr'] = 'turbomind_qwen2_5_7b'
+turbomind_qwen2_5_32b = deepcopy(base_model)
+turbomind_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B'
+turbomind_qwen2_5_32b['abbr'] = 'turbomind_qwen2_5_32b'
+turbomind_qwen2_5_32b['run_cfg']['num_gpus'] = 2
+turbomind_qwen2_5_32b['engine_config']['tp'] = 2
+turbomind_internlm2_5_7b = deepcopy(base_model)
+turbomind_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat'
+turbomind_internlm2_5_7b['abbr'] = 'turbomind_internlm2_5_7b'
+turbomind_glm_4_9b = deepcopy(base_model)
+turbomind_glm_4_9b['path'] = 'THUDM/glm-4-9b'
+turbomind_glm_4_9b['abbr'] = 'turbomind_glm_4_9b'
+turbomind_llama_3_70b = deepcopy(base_model)
+turbomind_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B'
+turbomind_llama_3_70b['abbr'] = 'turbomind_llama_3_70b'
+turbomind_llama_3_70b['run_cfg']['num_gpus'] = 4
+turbomind_llama_3_70b['engine_config']['tp'] = 4
+turbomind_llama_3_1_8b = deepcopy(base_model)
+turbomind_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B'
+turbomind_llama_3_1_8b['abbr'] = 'turbomind_llama_3_1_8b'
+turbomind_qwen3_0_6b_base = deepcopy(base_model)
+turbomind_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base'
+turbomind_qwen3_0_6b_base['abbr'] = 'turbomind_qwen3_0_6b_base'
 turbomind_qwen3_8b_base = deepcopy(base_model)
-pytorch_qwen3_8b_base = deepcopy(base_model)
-turbomind_qwen3_8b_base_4bits = deepcopy(base_model)
-turbomind_qwen3_8b_base_kvint8 = deepcopy(base_model)
-for model in [
-    v for k, v in locals().items()
-    if k.startswith('turbomind_qwen3_8b_base') or k.startswith('pytorch_qwen3_8b_base')
-]:
-    model['abbr'] = 'qwen3_8b_base_turbomind'
-    model['path'] = 'Qwen/Qwen3-8B-Base'
-    model['run_cfg']['num_gpus'] = 1
-    model['engine_config']['tp'] = 1
+turbomind_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base'
+turbomind_qwen3_8b_base['abbr'] = 'turbomind_qwen3_8b_base'
+turbomind_qwen3_30b_A3B_base = deepcopy(base_model)
+turbomind_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base'
+turbomind_qwen3_30b_A3B_base['abbr'] = 'turbomind_qwen3_30b_A3B_base'
+turbomind_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2
+turbomind_qwen3_30b_A3B_base['engine_config']['tp'] = 2
 
-for model in [v for k, v in locals().items() if k.endswith('_4bits')]:
-    model['engine_config']['model_format'] = 'awq'
-    model['abbr'] = model['abbr'] + '_4bits'
-    model['path'] = model['path'] + '-inner-4bits'
-
-for model in [v for k, v in locals().items() if '_batch1' in k]:
-    model['abbr'] = model['abbr'] + '_batch1'
-    model['engine_config']['max_batch_size'] = 1
-    model['batch_size'] = 1
+pytorch_qwen2_5_1_5b = deepcopy(base_model)
+pytorch_qwen2_5_1_5b['path'] = 'Qwen/Qwen2.5-1.5B'
+pytorch_qwen2_5_1_5b['abbr'] = 'pytorch_qwen2_5_1_5b'
+pytorch_qwen2_5_7b = deepcopy(base_model)
+pytorch_qwen2_5_7b['path'] = 'Qwen/Qwen2.5-7B'
+pytorch_qwen2_5_7b['abbr'] = 'pytorch_qwen2_5_7b'
+pytorch_qwen2_5_32b = deepcopy(base_model)
+pytorch_qwen2_5_32b['path'] = 'Qwen/Qwen2.5-32B'
+pytorch_qwen2_5_32b['abbr'] = 'pytorch_qwen2_5_32b'
+pytorch_qwen2_5_32b['run_cfg']['num_gpus'] = 2
+pytorch_qwen2_5_32b['engine_config']['tp'] = 2
+pytorch_internlm2_5_7b = deepcopy(base_model)
+pytorch_internlm2_5_7b['path'] = 'internlm/internlm2_5-7b-chat'
+pytorch_internlm2_5_7b['abbr'] = 'pytorch_internlm2_5_7b'
+pytorch_gemma_2_9b = deepcopy(base_model)
+pytorch_gemma_2_9b['path'] = 'google/gemma-2-9b'
+pytorch_gemma_2_9b['abbr'] = 'pytorch_gemma_2_9b'
+pytorch_llama_3_70b = deepcopy(base_model)
+pytorch_llama_3_70b['path'] = 'meta-llama/Meta-Llama-3-70B'
+pytorch_llama_3_70b['abbr'] = 'pytorch_llama_3_70b'
+pytorch_llama_3_70b['run_cfg']['num_gpus'] = 4
+pytorch_llama_3_70b['engine_config']['tp'] = 4
+pytorch_llama_3_1_8b = deepcopy(base_model)
+pytorch_llama_3_1_8b['path'] = 'meta-llama/Llama-3.1-8B'
+pytorch_llama_3_1_8b['abbr'] = 'pytorch_llama_3_1_8b'
+pytorch_qwen3_0_6b_base = deepcopy(base_model)
+pytorch_qwen3_0_6b_base['path'] = 'Qwen/Qwen3-0.6B-Base'
+pytorch_qwen3_0_6b_base['abbr'] = 'pytorch_qwen3_0_6b_base'
+pytorch_qwen3_8b_base = deepcopy(base_model)
+pytorch_qwen3_8b_base['path'] = 'Qwen/Qwen3-8B-Base'
+pytorch_qwen3_8b_base['abbr'] = 'pytorch_qwen3_8b_base'
+pytorch_qwen3_30b_A3B_base = deepcopy(base_model)
+pytorch_qwen3_30b_A3B_base['path'] = 'Qwen/Qwen3-30B-A3B-Base'
+pytorch_qwen3_30b_A3B_base['abbr'] = 'pytorch_qwen3_30b_A3B_base'
+pytorch_qwen3_30b_A3B_base['run_cfg']['num_gpus'] = 2
+pytorch_qwen3_30b_A3B_base['engine_config']['tp'] = 2
 
 for model in [v for k, v in locals().items() if k.startswith('pytorch_')]:
-    model['abbr'] = model['abbr'].replace('turbomind', 'pytorch')
     model['backend'] = 'pytorch'
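The rewrite drops the old argument-unpacking copies (`deepcopy(*lmdeploy_qwen1_5_7b)`, which only worked because each imported `models` list held exactly one dict) in favor of a single `base_model` template that every variant deepcopies and then patches. A sketch of a helper capturing that repeated clone-and-override pattern; the helper is hypothetical, not part of the commit:

```python
from copy import deepcopy

# Hypothetical helper illustrating the pattern the config now writes out by
# hand: clone the shared template, then override path/abbr and GPU settings.
def make_model(base, abbr, path, num_gpus=1):
    model = deepcopy(base)
    model['abbr'] = abbr
    model['path'] = path
    model['run_cfg']['num_gpus'] = num_gpus
    model['engine_config']['tp'] = num_gpus
    return model

base_model = {
    'engine_config': {'session_len': 7168, 'tp': 1},
    'run_cfg': {'num_gpus': 1},
}
turbomind_qwen2_5_32b = make_model(base_model, 'turbomind_qwen2_5_32b',
                                   'Qwen/Qwen2.5-32B', num_gpus=2)
```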

.github/workflows/api_eval.yml

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
+name: api_eval
+
+on:
+  workflow_dispatch:
+    inputs:
+      repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is InternLM/lmdeploy'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref:
+        required: false
+        description: 'Set branch or tag or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      backend:
+        required: true
+        description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
+        type: string
+        default: "['turbomind', 'pytorch']"
+
+
+env:
+  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
+  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
+  OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+  REPORT_DIR: /nvme/qa_test_models/test-reports/${{ github.run_id }}
+  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  FAIL_CONFIG: '--lf'
+  TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }}
+  OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
+  OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
+  DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL
+  COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache
+
+jobs:
+  linux-build:
+    if: ${{ !cancelled() }}
+    strategy:
+      matrix:
+        pyver: [py310]
+    runs-on: ubuntu-latest
+    env:
+      PYTHON_VERSION: ${{ matrix.pyver }}
+      PLAT_NAME: manylinux2014_x86_64
+      DOCKER_TAG: cuda12.4
+      OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Build
+        run: |
+          echo ${PYTHON_VERSION}
+          echo ${PLAT_NAME}
+          echo ${DOCKER_TAG}
+          echo ${OUTPUT_FOLDER}
+          echo ${GITHUB_RUN_ID}
+          # remove -it
+          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
+          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
+          retention-days: 1
+          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
+
+  test_evaluation:
+    needs: linux-build
+    if: ${{ !cancelled() }}
+    runs-on: [self-hosted, test-140]
+    timeout-minutes: 2400
+    strategy:
+      fail-fast: false
+      matrix:
+        backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}}
+    container:
+      image: openmmlab/lmdeploy:latest-cu12
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
+      volumes:
+        - /nvme/github-actions/pip-cache:/root/.cache/pip
+        - /nvme/github-actions/packages:/root/packages
+        - /nvme/github-actions/resources:/root/resources
+        - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports
+        - /nvme/qa_test_models:/nvme/qa_test_models
+        - /mnt/shared:/mnt/shared
+        - /mnt/bigdisk:/mnt/bigdisk
+        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
+        - /mnt/187:/mnt/187
+    steps:
+      - name: Create and change to _wk directory
+        run: |
+          echo "Working directory set to: $(pwd)"
+      - name: Clone repository
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Download Artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: my-artifact-${{ github.run_id }}-py310
+      - name: Install lmdeploy - dependency
+        run: |
+          python3 -m pip install -r requirements_cuda.txt
+          python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}}
+      - name: Install lmdeploy
+        run: |
+          python3 -m pip install lmdeploy-*.whl --no-deps
+          python3 -m pip install -r requirements/test.txt
+      - name: Install opencompass
+        run: |
+          python3 -m pip install opencompass
+      - name: Check env
+        run: |
+          python3 -m pip list
+          lmdeploy check_env
+          rm -rf allure-results
+          mkdir -p ${{ env.REPORT_DIR }}/.pytest_cache
+          ln -s ${{ env.REPORT_DIR }}/.pytest_cache autotest
+      - name: Setup paths for evaluation
+        if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind')
+        run: |
+          overall_exit=0
+          ln -s /mnt/187/opencompass-data/data ./data
+          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and not pr_test and ${{matrix.backend}}" -n 8 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and not pr_test and ${{matrix.backend}}" -n 4 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and not pr_test and ${{matrix.backend}}" -n 2 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and not pr_test and ${{matrix.backend}}" -n 1 --run_id ${{ github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
+          exit $overall_exit
+      - name: Clear workspace
+        if: always()
+        run: |
+          export workdir=$(pwd)
+          rm -rf $workdir/*
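In the evaluation step, the pytest `-n` worker count drops as the `gpu_num_*` marker rises, so concurrent test cases never request more GPUs than the runner has; `|| overall_exit=$?` lets later shards run after a failure while the final `exit` still propagates it. A small sketch of the arithmetic, assuming an 8-GPU runner (the GPU count is an assumption, not stated in the workflow):

```python
# Assumed 8-GPU self-hosted runner; the workflow hard-codes the resulting
# worker counts (-n 8/4/2/1) rather than computing them like this.
TOTAL_GPUS = 8  # assumption

for gpus_per_case in (1, 2, 4, 8):
    workers = TOTAL_GPUS // gpus_per_case  # keep GPUs fully subscribed, never over
    print(f'pytest -m "gpu_num_{gpus_per_case} and not pr_test and <backend>" -n {workers}')
```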
