# flake8: noqa

from copy import deepcopy

from mmengine.config import read_base

from opencompass.models import OpenAISDKStreaming
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
11+
#######################################################################
#                      PART 0  Essential Configs                      #
#######################################################################
with read_base():
    # Datasets
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import (
        mmlu_pro_datasets,
    )
    from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_gen_772ea0 import (
        gpqa_datasets,
    )
    from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import (
        aime2025_datasets,
    )
    from opencompass.configs.chatml_datasets.IMO_Bench_AnswerBench.IMO_Bench_AnswerBench_gen import (
        datasets as IMO_Bench_AnswerBench_chatml,
    )
    from opencompass.configs.datasets.IFBench.IFBench_gen import (
        ifbench_datasets,
    )
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
        LCBCodeGeneration_dataset,
    )
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_0shot_instruct_gen import (
        smolinstruct_datasets_0shot_instruct as smolinstruct_datasets,
    )
    from opencompass.configs.datasets.matbench.matbench_llm_judge_gen_0e9276 import (
        matbench_datasets,
    )
    from opencompass.configs.datasets.biodata.biodata_task_gen import (
        biodata_task_datasets,
    )
    from opencompass.configs.datasets.MolInstructions_chem.mol_instructions_chem_gen import (
        mol_gen_selfies_datasets,
    )

    # Summary Groups
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups
    from opencompass.configs.summarizers.groups.biodata import (
        biodata_summary_groups,
    )
55+
# LiveCodeBench v6: clone the base code-generation dataset config and
# retarget both the dataset and its evaluator to the 'v6' release.
LCBCodeGeneration_v6_datasets = deepcopy(LCBCodeGeneration_dataset)
LCBCodeGeneration_v6_datasets['abbr'] = 'lcb_code_generation_v6'
LCBCodeGeneration_v6_datasets['release_version'] = 'v6'
LCBCodeGeneration_v6_datasets['eval_cfg']['evaluator']['release_version'] = 'v6'
# Wrap in a list so the `_datasets`-suffix collection below picks it up.
LCBCodeGeneration_v6_datasets = [LCBCodeGeneration_v6_datasets]
63+
#######################################################################
#                       PART 1  Datasets List                         #
#######################################################################
# datasets list for evaluation

# Pairs of (dataset list, number of repeated runs) for run-averaged accuracy.
repeated_info = [
    (gpqa_datasets, 8),
    (aime2025_datasets, 32),
]

for datasets_, num in repeated_info:
    for dataset_ in datasets_:
        dataset_['n'] = num  # number of sampled generations per problem
        dataset_['k'] = num  # k used when averaging over the n runs
78+
# Flatten every `*_datasets` variable currently in scope into one list;
# likewise for the chatml-format `*_chatml` dataset lists.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

chatml_datasets = sum(
    (v for k, v in locals().items() if k.endswith('_chatml')),
    [],
)
88+
# LLM judge config: using LLM to evaluate predictions
judge_cfg = dict(
    abbr='YOUR_JUDGE_MODEL',
    type=OpenAISDKStreaming,
    path='YOUR_JUDGE_MODEL',
    key='YOUR_JUDGE_KEY',
    openai_api_base='YOUR_JUDGE_URL',
    mode='mid',
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ]
    ),
    query_per_second=16,
    batch_size=64,
    temperature=0.001,
    max_out_len=8192,
    max_seq_len=32768,
)
109+
# Inject the judge model into every evaluator that declares a judge slot,
# including LLM evaluators nested inside cascade evaluators.
for item in datasets:
    evaluator = item['eval_cfg']['evaluator']
    if 'judge_cfg' in evaluator:
        evaluator['judge_cfg'] = judge_cfg
    if 'llm_evaluator' in evaluator and 'judge_cfg' in evaluator['llm_evaluator']:
        evaluator['llm_evaluator']['judge_cfg'] = judge_cfg

# chatml datasets carry the evaluator at the top level instead of `eval_cfg`.
for item in chatml_datasets:
    if item['evaluator']['type'] == 'llm_evaluator':
        item['evaluator']['judge_cfg'] = judge_cfg
    if item['evaluator']['type'] == 'cascade_evaluator':
        item['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
121+
122+
#######################################################################
#                     PART 2  Dataset Summarizer                      #
#######################################################################

# Merge every `*_summary_groups` list currently in scope into one list.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
130+
# Rows of the final report: [dataset abbr, metric] pairs; bare strings are
# section headers and '' inserts a blank separator row.
summarizer = dict(
    dataset_abbrs=[
        ['mmlu_pro', 'accuracy'],
        ['IFBench', 'score'],
        ['GPQA_diamond', 'accuracy (8 runs average)'],
        ['aime2025', 'accuracy (32 runs average)'],
        ['lcb_code_generation_v6', 'pass@1'],
        ['bio_data', 'naive_average'],
        ['IMO-Bench-AnswerBench', 'accuracy'],
        '',
        'Mol_Instruct',
        ['FS-selfies', 'score'],
        ['MC-selfies', 'score'],
        ['MG-selfies', 'score'],
        ['PP-selfies', 'score'],
        ['RP-selfies', 'score'],
        ['RS-selfies', 'score'],
        '',
        'SmolInstruct',
        ['NC-I2F-0shot-instruct', 'score'],
        ['NC-I2S-0shot-instruct', 'score'],
        ['NC-S2F-0shot-instruct', 'score'],
        ['NC-S2I-0shot-instruct', 'score'],
        ['PP-ESOL-0shot-instruct', 'score'],
        ['PP-Lipo-0shot-instruct', 'score'],
        ['PP-BBBP-0shot-instruct', 'accuracy'],
        ['PP-ClinTox-0shot-instruct', 'accuracy'],
        ['PP-HIV-0shot-instruct', 'accuracy'],
        ['PP-SIDER-0shot-instruct', 'accuracy'],
        ['MC-0shot-instruct', 'score'],
        ['MG-0shot-instruct', 'score'],
        ['FS-0shot-instruct', 'score'],
        ['RS-0shot-instruct', 'score'],
        '',
        ['matbench_expt_gap', 'mae'],
        ['matbench_steels', 'mae'],
        ['matbench_expt_is_metal', 'accuracy'],
        ['matbench_glass', 'accuracy'],
    ],
    summary_groups=summary_groups,
)
172+
173+
#######################################################################
#                          PART 3  Models                             #
#######################################################################

api_meta_template = dict(
    round=[
        # System prompt is only needed when evaluating Bio_data and
        # Mol_instructions.
        dict(role='SYSTEM', api_role='SYSTEM'),
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
185+
models = [
    dict(
        abbr='intern-s1-pro',
        type=OpenAISDKStreaming,
        path='intern-s1-pro',
        key='YOUR_API_KEY',
        openai_api_base='YOUR_API_BASE',
        meta_template=api_meta_template,
        query_per_second=16,
        batch_size=8,
        temperature=0.8,
        retry=10,
        max_out_len=65536,
        max_seq_len=65536,
        extra_body={
            # Disable thinking when evaluating scientific benchmarks
            'chat_template_kwargs': {'enable_thinking': True}
        },
        # Strip the reasoning segment from streamed responses before scoring.
        pred_postprocessor=dict(
            type=extract_non_reasoning_content,
        ),
    ),
]
208+
#######################################################################
#              PART 4  Inference/Evaluation Configuration             #
#######################################################################

# infer with local runner
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)

# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLEvalTask),
    ),
)
232+
#######################################################################
#                     PART 5  Utils Configuration                     #
#######################################################################

# All inference/evaluation artifacts are written under this directory.
work_dir = './outputs/oc_intern_s1_pro_eval'