diff --git a/.github/scripts/eval_regression_chat_obj_fullbench_v8.py b/.github/scripts/eval_regression_chat_obj_fullbench_v8.py index 53908f976..cb588d6cb 100644 --- a/.github/scripts/eval_regression_chat_obj_fullbench_v8.py +++ b/.github/scripts/eval_regression_chat_obj_fullbench_v8.py @@ -12,6 +12,10 @@ biodata_task_datasets # noqa: F401, E501 from opencompass.configs.datasets.CMPhysBench.cmphysbench_gen import \ cmphysbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.IFBench.IFBench_gen import \ + ifbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.livecodebench_pro.livecodebench_pro_gen import \ + lcb_pro_datasets # noqa: F401, E501 from opencompass.configs.datasets.MolInstructions_chem.mol_instructions_chem_gen import \ mol_gen_selfies_datasets # noqa: F401, E501 from opencompass.configs.datasets.openswi.openswi_gen import \ @@ -19,10 +23,7 @@ from ...rjob import eval, infer # noqa: F401, E501 -datasets = [ - *atlas_datasets, *biodata_task_datasets, *cmphysbench_datasets, - *mol_gen_selfies_datasets, *openswi_datasets -] +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) for d in datasets: if 'n' in d: diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 427dd98de..116485a93 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -352,6 +352,13 @@ qwen-3-8b-hf-fullbench: OpenSWI-shallow-1k_valid: 18.75 OpenSWI-deep-1k_score: 1069.07 OpenSWI-deep-1k_valid: 6.25 + IFBench_score: 25 + IFBench_Prompt-level-strict-accuracy: 25 + IFBench_Inst-level-strict-accuracy: 25 + IFBench_Prompt-level-loose-accuracy: 25 + IFBench_Inst-level-loose-accuracy: 25 + lcb_pro_accuracy: 25 + lcb_pro_pass@1: 25 chat_subjective: alignment_bench_v1_1_总分: 0.46 arenahard_score: 100 @@ -740,6 +747,13 @@ qwen-3-8b-fullbench: OpenSWI-shallow-1k_valid: 6.25 OpenSWI-deep-1k_score: 861.96 OpenSWI-deep-1k_valid: 0 + IFBench_score: 25 + IFBench_Prompt-level-strict-accuracy: 25 + IFBench_Inst-level-strict-accuracy: 25 + IFBench_Prompt-level-loose-accuracy: 25 + IFBench_Inst-level-loose-accuracy: 25 + lcb_pro_accuracy: 37.5 + lcb_pro_pass@1: 37.5 chat_longtext: babilong_qa1_256k_score: 0.00 LongBench_2wikimqa_score: 5.43 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 61c9f05bc..98a77aa04 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -42,8 +42,9 @@ env: CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3 REPORT_ROOT: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/eval_report/regression_test COMPASS_DATA_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache - HF_DATASETS_CACHE: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/hf_cache + HF_DATASETS_CACHE: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache HF_HUB_CACHE: /mnt/shared-storage-user/large-model-center-share-weights/hf_hub + HF_DATASETS_DISABLE_LOCKFILES: 1 HF_ENDPOINT: https://hf-mirror.com TMPDIR: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/tmpdir PIP_CACHE_DIR: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/pip_cache @@ -222,7 +223,7 @@ jobs: . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} conda info --envs - rjob submit --metadata-name=cmd-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_cmd_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}' + rjob submit --metadata-name=cmd-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=2 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_cmd_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }}' for i in {1..300}; do current_status=$(rjob get cmd-${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+') @@ -277,7 +278,7 @@ jobs: . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} conda info --envs - rjob submit --metadata-name=${{matrix.regression_func}}-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_${{matrix.regression_func}}_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }} ${{env.WORK_PATH}}' + rjob submit --metadata-name=${{matrix.regression_func}}-${{ env.JOB_NAME }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=32 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu/xpuyu:torch-2.6.0-45d96d5f-0607 --env=COMPASS_DATA_CACHE=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/compass_data_cache --env=TIKTOKEN_CACHE_DIR=/mnt/shared-storage-user/auto-eval-pipeline/opencompass/llmeval/share_tiktoken --env=HF_ENDPOINT=https://hf-mirror.com --env=HF_DATASETS_CACHE=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/hf_cache --env=HF_HUB_CACHE=/mnt/shared-storage-user/large-model-center-share-weights/hf_hub --env=CUDA_MODULE_LOADING=EAGER --env=HF_DATASETS_OFFLINE=1 --env=TRANSFORMERS_OFFLINE=1 --env=HF_EVALUATE_OFFLINE=1 --env=HF_HUB_OFFLINE=1 --env=VLLM_USE_MODELSCOPE=false --env=VLLM_WORKER_MULTIPROC_METHOD=spawn --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs1/large-model-center-share-weights:/mnt/shared-storage-user/large-model-center-share-weights --host-network=True -- bash -exc '/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/daily_${{matrix.regression_func}}_test.sh ${{env.REPORT_ROOT}}/${{ github.run_id }} ${{env.WORK_PATH}}' for i in {1..300}; do current_status=$(rjob get ${{matrix.regression_func}}-${{ env.JOB_NAME }} | grep -oP 'rjob [^:]+: \K[^ ]+')