# flake8: noqa

from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
# NOTE(review): VOLCRunner is imported but never referenced below —
# presumably kept as a drop-in alternative to LocalRunner; confirm.
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

#######################################################################
#                     PART 0 Essential Configs                        #
#######################################################################
# read_base() executes the imports below through mmengine so the
# referenced config modules' symbols land in this module's namespace.
# The *_datasets / *_model / *_summary_groups names are later gathered
# by the locals() scans in PARTs 1-3.
with read_base():
    # Datasets Part
    # Knowledge
    # Math
    from opencompass.configs.datasets.aime2024.aime2024_0shot_nocot_genericllmeval_academic_gen import \
        aime2024_datasets
    from opencompass.configs.datasets.bbh.bbh_0shot_nocot_academic_gen import \
        bbh_datasets
    # General Reasoning
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
        gpqa_datasets
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
        humaneval_datasets
    # Instruction Following
    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
        ifeval_datasets
    # Code (only the generation split of LiveCodeBench is pulled in;
    # it is appended to `datasets` explicitly in PART 1)
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \
        LCBCodeGeneration_dataset
    from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \
        math_datasets
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
        mmlu_pro_datasets
    # Model List
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model
    # Summary Groups
    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups
#######################################################################
#                       PART 1 Datasets List                          #
#######################################################################
# Collect every *_datasets list imported via read_base, then append
# the single LiveCodeBench code-generation dataset (only that split of
# LCB is evaluated).
datasets = [
    dataset_cfg for key, group in locals().items()
    if key.endswith('_datasets') for dataset_cfg in group
]
datasets.append(LCBCodeGeneration_dataset)

# LLM-as-judge configuration: an empty dict keeps each evaluator's
# defaults; fill it in to point LLM-judged evaluators at a judge model.
judge_cfg = dict()
for ds in datasets:
    # Allow long generations for reasoning-heavy benchmarks.
    ds['infer_cfg']['inferencer']['max_out_len'] = 32768
    evaluator = ds['eval_cfg']['evaluator']
    if 'judge_cfg' in evaluator:
        evaluator['judge_cfg'] = judge_cfg
#######################################################################
#                      PART 2 Dataset Summarizer                      #
#######################################################################

# Single aggregate averaging every benchmark metric in this suite.
core_summary_groups = [
    {
        'name': 'core_average',
        'subsets': [
            ['IFEval', 'Prompt-level-strict-accuracy'],
            ['bbh', 'naive_average'],
            ['math_prm800k_500', 'accuracy'],
            ['aime2024', 'accuracy'],
            ['GPQA_diamond', 'accuracy'],
            ['mmlu_pro', 'naive_average'],
            ['openai_humaneval', 'humaneval_pass@1'],
            ['lcb_code_generation', 'pass@1'],
        ],
    },
]

# Report layout: [abbr, metric] pairs are table rows; bare strings and
# '' entries presumably render as section headings / separators in the
# OpenCompass summary table — confirm against the summarizer docs.
summarizer = dict(
    dataset_abbrs=[
        ['core_average', 'naive_average'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        '',
        'Math Calculation',
        ['math_prm800k_500', 'accuracy'],
        ['aime2024', 'accuracy'],
        '',
        'Knowledge',
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['lcb_code_generation', 'pass@1'],
    ],
    # Flatten every *_summary_groups list defined so far (core_average
    # above plus the bbh / mmlu_pro groups pulled in via read_base).
    summary_groups=[
        group for key, groups in locals().items()
        if key.endswith('_summary_groups') for group in groups
    ],
)
#######################################################################
#                        PART 3 Models List                           #
#######################################################################

# Flatten every list bound to a *_model name (populated by the model
# imports inside read_base at the top of the file).
models = [
    model for key, group in locals().items()
    if key.endswith('_model') for model in group
]
#######################################################################
#              PART 4 Inference/Evaluation Configuration              #
#######################################################################

# Inference stage: each dataset is split across 8 workers and executed
# with the local runner.
infer = {
    'partitioner': {'type': NumWorkerPartitioner, 'num_worker': 8},
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 16,
        'retry': 0,  # Modify if needed
        'task': {'type': OpenICLInferTask},
    },
}

# Evaluation stage: naive partitioning, also run with the local runner.
# NOTE: `eval` shadows the builtin, but OpenCompass expects this exact
# top-level config name.
eval = {
    'partitioner': {'type': NaivePartitioner, 'n': 10},
    'runner': {
        'type': LocalRunner,
        'max_num_workers': 16,
        'task': {'type': OpenICLEvalTask},
    },
}
#######################################################################
#                      PART 5 Utils Configuration                     #
#######################################################################
# All inference predictions, evaluation results and summaries are
# written under this directory.
work_dir = './outputs/oc_academic_202502'