Skip to content

Commit 4fee422

Browse files
[ci] add ifbench and lcb_pro into daily testcase (#2369)
* update
* update
* update
* update
* update
1 parent 2bf3bc2 commit 4fee422

File tree

3 files changed

+23
-7
lines changed

3 files changed

+23
-7
lines changed

.github/scripts/eval_regression_chat_obj_fullbench_v8.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12,17 +12,18 @@
1212
biodata_task_datasets # noqa: F401, E501
1313
from opencompass.configs.datasets.CMPhysBench.cmphysbench_gen import \
1414
cmphysbench_datasets # noqa: F401, E501
15+
from opencompass.configs.datasets.IFBench.IFBench_gen import \
16+
ifbench_datasets # noqa: F401, E501
17+
from opencompass.configs.datasets.livecodebench_pro.livecodebench_pro_gen import \
18+
lcb_pro_datasets # noqa: F401, E501
1519
from opencompass.configs.datasets.MolInstructions_chem.mol_instructions_chem_gen import \
1620
mol_gen_selfies_datasets # noqa: F401, E501
1721
from opencompass.configs.datasets.openswi.openswi_gen import \
1822
openswi_datasets # noqa: F401, E501
1923

2024
from ...rjob import eval, infer # noqa: F401, E501
2125

22-
datasets = [
23-
*atlas_datasets, *biodata_task_datasets, *cmphysbench_datasets,
24-
*mol_gen_selfies_datasets, *openswi_datasets
25-
]
26+
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
2627

2728
for d in datasets:
2829
if 'n' in d:

.github/scripts/oc_score_baseline_fullbench.yaml

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -352,6 +352,13 @@ qwen-3-8b-hf-fullbench:
352352
OpenSWI-shallow-1k_valid: 18.75
353353
OpenSWI-deep-1k_score: 1069.07
354354
OpenSWI-deep-1k_valid: 6.25
355+
IFBench_score: 25
356+
IFBench_Prompt-level-strict-accuracy: 25
357+
IFBench_Inst-level-strict-accuracy: 25
358+
IFBench_Prompt-level-loose-accuracy: 25
359+
IFBench_Inst-level-loose-accuracy: 25
360+
lcb_pro_accuracy: 25
361+
lcb_pro_pass@1: 25
355362
chat_subjective:
356363
alignment_bench_v1_1_总分: 0.46
357364
arenahard_score: 100
@@ -740,6 +747,13 @@ qwen-3-8b-fullbench:
740747
OpenSWI-shallow-1k_valid: 6.25
741748
OpenSWI-deep-1k_score: 861.96
742749
OpenSWI-deep-1k_valid: 0
750+
IFBench_score: 25
751+
IFBench_Prompt-level-strict-accuracy: 25
752+
IFBench_Inst-level-strict-accuracy: 25
753+
IFBench_Prompt-level-loose-accuracy: 25
754+
IFBench_Inst-level-loose-accuracy: 25
755+
lcb_pro_accuracy: 37.5
756+
lcb_pro_pass@1: 37.5
743757
chat_longtext:
744758
babilong_qa1_256k_score: 0.00
745759
LongBench_2wikimqa_score: 5.43

.github/workflows/daily-run-test.yml

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,8 +42,9 @@ env:
4242
CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
4343
REPORT_ROOT: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/eval_report/regression_test
4444
COMPASS_DATA_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache
45-
HF_DATASETS_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/hf_cache
45+
HF_DATASETS_CACHE: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache
4646
HF_HUB_CACHE: /mnt/shared-storage-user/large-model-center-share-weights/hf_hub
47+
HF_DATASETS_DISABLE_LOCKFILES: 1
4748
HF_ENDPOINT: https://hf-mirror.com
4849
TMPDIR: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/tmpdir
4950
PIP_CACHE_DIR: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/pip_cache
@@ -222,7 +223,7 @@ jobs:
222223
. ${{env.CONDA_PATH}}/bin/activate
223224
conda activate ${{env.CONDA_ENV}}
224225
conda info --envs
225-
rjob submit --metadata-name=cmd-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_cmd_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}'
226+
rjob submit --metadata-name=cmd-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_cmd_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}'
226227
227228
for i in {1..300}; do
228229
current_status=$(rjob get cmd-${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+')
@@ -277,7 +278,7 @@ jobs:
277278
. ${{env.CONDA_PATH}}/bin/activate
278279
conda activate ${{env.CONDA_ENV}}
279280
conda info --envs
280-
rjob submit --metadata-name=${{matrix.regression_func}}-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_${{matrix.regression_func}}_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }} ${{env.WORK_PATH}}'
281+
rjob submit --metadata-name=${{matrix.regression_func}}-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_${{matrix.regression_func}}_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }} ${{env.WORK_PATH}}'
281282
282283
for i in {1..300}; do
283284
current_status=$(rjob get ${{matrix.regression_func}}-${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+')

0 commit comments

Comments
 (0)