Skip to content

Commit 989b373

Browse files
jnanliuliushz
authored and committed
[Feature] Support Omni-Math (open-compass#1837)
* support omni-math * update config * upload README * Delete opencompass/configs/datasets/omni_math/__init__.py --------- Co-authored-by: liushz <qq1791167085@163.com>
1 parent 5379086 commit 989b373

File tree

5 files changed

+220
-3
lines changed

5 files changed

+220
-3
lines changed
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Omni-Math
2+
3+
[Omni-Math](https://huggingface.co/datasets/KbsdJames/Omni-MATH) contains 4428 competition-level problems. These problems are meticulously categorized into 33 (and potentially more) sub-domains and span across 10 distinct difficulty levels, enabling a nuanced analysis of model performance across various mathematical disciplines and levels of complexity.
4+
5+
* Project Page: https://omni-math.github.io/
6+
* Github Repo: https://github.com/KbsdJames/Omni-MATH
7+
* Omni-Judge (opensource evaluator of this dataset): https://huggingface.co/KbsdJames/Omni-Judge
8+
9+
## Omni-Judge
10+
11+
> Omni-Judge is an open-source mathematical evaluation model designed to assess whether a solution generated by a model is correct given a problem and a standard answer.
12+
13+
You should deploy the Omni-Judge server, for example:
14+
```bash
15+
set -x
16+
17+
lmdeploy serve api_server KbsdJames/Omni-Judge --server-port 8000 \
18+
--tp 1 \
19+
--cache-max-entry-count 0.9 \
20+
--log-level INFO
21+
```
22+
23+
and set the server URL in the OpenCompass config file:
24+
25+
```python
26+
from mmengine.config import read_base
27+
28+
with read_base():
29+
from opencompass.configs.datasets.omni_math.omni_math_gen import omni_math_datasets
30+
31+
32+
omni_math_dataset = omni_math_datasets[0]
33+
omni_math_dataset['eval_cfg']['evaluator'].update(
34+
url=['http://172.30.8.45:8000',
35+
'http://172.30.16.113:8000'],
36+
)
37+
```
38+
39+
## Performance
40+
41+
| llama-3_1-8b-instruct | qwen-2_5-7b-instruct | InternLM3-8b-Instruct |
42+
| -- | -- | -- |
43+
| 15.18 | 29.97 | 32.75 |
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from mmengine.config import read_base
2+
3+
with read_base():
4+
from .omni_math_gen_18cc08 import omni_math_datasets # noqa: F401, F403
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

from opencompass.datasets.omni_math import OmniMathDataset, OmniMathEvaluator

# Reader: feed the raw problem text in, compare against the `answer` column.
reader_cfg = dict(input_columns=['problem'], output_column='answer')

# Zero-shot generation: a single user turn asking for a \boxed{} final answer.
infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=('please answer the following mathematical question, '
                        'put your final answer in \\boxed{}.\n\n{problem}'),
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048, temperature=0.0),
)

# Evaluation is delegated to Omni-Judge servers; users supply the address(es)
# later via `omni_math_dataset['eval_cfg']['evaluator'].update(url=[...])`.
eval_cfg = dict(evaluator=dict(type=OmniMathEvaluator, url=[]))

omni_math_datasets = [
    dict(
        type=OmniMathDataset,
        abbr='OmniMath',
        reader_cfg=reader_cfg,
        infer_cfg=infer_cfg,
        eval_cfg=eval_cfg,
    ),
]

opencompass/datasets/omni_math.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import concurrent.futures
2+
from typing import List
3+
4+
import numpy as np
5+
from datasets import load_dataset
6+
from transformers import AutoTokenizer
7+
8+
from opencompass.models.turbomind_api import TurboMindAPIModel
9+
from opencompass.openicl.icl_evaluator import BaseEvaluator
10+
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS
11+
12+
from .base import BaseDataset
13+
14+
15+
@LOAD_DATASET.register_module()
class OmniMathDataset(BaseDataset):
    """Loader for the Omni-MATH competition-mathematics benchmark."""

    @staticmethod
    def load():
        """Fetch the `test` split of KbsdJames/Omni-MATH from the HF Hub."""
        return load_dataset('KbsdJames/Omni-MATH')['test']
22+
23+
24+
@ICL_EVALUATORS.register_module()
class OmniMathEvaluator(BaseEvaluator):
    """Judge-based evaluator for Omni-MATH.

    Sends (question, reference answer, candidate answer) triples to one or
    more deployed Omni-Judge servers and parses the judge's structured
    markdown verdict into a boolean correctness label.
    """

    api_meta_template = dict(round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ])

    def __init__(self, url):
        """Build one Omni-Judge API client per server address.

        Args:
            url (str | List[str]): Address(es) of deployed Omni-Judge
                lmdeploy ``api_server`` instances.
        """
        # Normalize to a list; avoid shadowing the argument in the loop below.
        urls = [url] if isinstance(url, str) else url

        # One TurboMind API client per judge server; scoring requests are
        # sharded across them in `batch_infer`.
        self.model = [
            MODELS.build(
                dict(
                    type=TurboMindAPIModel,
                    model_name='KbsdJames/Omni-Judge',
                    api_addr=addr,
                    meta_template=self.api_meta_template,
                    temperature=0.0,
                    max_seq_len=8192,
                )) for addr in urls
        ]
        # The judge tokenizer provides `get_context` to format the
        # (question, reference, candidate) prompt expected by Omni-Judge.
        self.tokenizer = AutoTokenizer.from_pretrained('KbsdJames/Omni-Judge',
                                                       trust_remote_code=True)

    def batch_infer(self, models: List[TurboMindAPIModel],
                    inputs: List[str]) -> List[str]:
        """Shard `inputs` evenly across `models` and generate in parallel.

        Returns the responses in the original `inputs` order.
        """
        batch_num = len(models)
        # Ceil-divide so every input is assigned to exactly one model.
        batch_size = (len(inputs) + batch_num - 1) // batch_num
        result_responses = []

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=batch_num) as executor:
            futures = [
                executor.submit(models[i].generate,
                                inputs[i * batch_size:(i + 1) * batch_size])
                for i in range(batch_num)
            ]
            # `executor.map` preserves submission order, so the concatenated
            # responses line up with the original input order.
            for response in executor.map(lambda f: f.result(), futures):
                result_responses.extend(response)

        return result_responses

    def parse_response(self, response):
        """Parse an Omni-Judge reply into a boolean correctness verdict.

        The judge answers in markdown sections (``## Student Final Answer``,
        ``## Justification``, ``## Equivalence Judgement``); a prediction is
        correct iff the `Equivalence Judgement` section is exactly 'TRUE'.
        """
        # The judge's continuation starts mid-section, so prepend the header
        # the first section belongs to before splitting.
        response = '## Student Final Answer\n' + response.strip()

        info = {}
        for part in response.split('## ')[1:]:
            lines = part.strip().split('\n')
            title = lines[0].strip()
            if title == 'Justification':
                # Justification may span multiple lines; keep all of it.
                info[title] = '\n'.join(lines[1:]).strip()
            else:
                # Other sections are single-valued; empty if missing.
                info[title] = lines[1].strip() if len(lines) > 1 else ''

        # A missing or malformed verdict section counts as incorrect.
        return info.get('Equivalence Judgement') == 'TRUE'

    def score(self, predictions, references, origin_prompt, test_set):
        """Score `predictions` against `references` using Omni-Judge.

        Returns:
            dict: per-sample ``details`` plus an ``accuracy`` percentage.
        """
        questions = [d['problem'] for d in test_set]

        # Format one judge prompt per (question, reference, candidate).
        contexts = [
            self.tokenizer.get_context(question, reference, candidate)
            for question, reference, candidate in zip(questions, references,
                                                      predictions)
        ]

        responses = self.batch_infer(self.model, contexts)
        labels = [self.parse_response(response) for response in responses]

        details = [{
            'question': question,
            'reference': reference,
            'candidate': candidate,
            'response': response,
            'label': label,
        } for question, reference, candidate, response, label in zip(
            questions, references, predictions, responses, labels)]
        return {'details': details, 'accuracy': np.mean(labels) * 100}

opencompass/models/turbomind_api.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,25 +39,28 @@ class TurboMindAPIModel(BaseModel):
3939
is_api: bool = True
4040

4141
def __init__(self,
42+
model_name: str = None,
4243
api_addr: str = 'http://0.0.0.0:23333',
4344
api_key: str | None = None,
4445
max_seq_len: int = 2048,
4546
meta_template: Optional[Dict] = None,
4647
end_str: Optional[str] = None,
48+
temperature: float = None,
4749
**kwargs):
4850
super().__init__(path='',
4951
max_seq_len=max_seq_len,
5052
meta_template=meta_template)
5153
from lmdeploy.serve.openai.api_client import APIClient
5254
self.chatbot = APIClient(api_addr, api_key)
53-
self.model_name = self.chatbot.available_models[0]
55+
self.model_name = model_name
5456
self.logger = get_logger()
5557
self.template_parser = LMTemplateParser(meta_template)
5658
self.eos_token_id = None
5759
if meta_template and 'eos_token_id' in meta_template:
5860
self.eos_token_id = meta_template['eos_token_id']
5961
self.api_addr = api_addr
6062
self.end_str = end_str
63+
self.temperature = temperature
6164

6265
def generate(
6366
self,
@@ -84,6 +87,9 @@ def generate(
8487
List[str]: A list of generated strings.
8588
"""
8689

90+
if self.temperature is not None:
91+
temperature = self.temperature
92+
8793
with ThreadPoolExecutor() as executor:
8894
results = list(
8995
executor.map(self._generate, inputs,
@@ -125,13 +131,14 @@ def _generate(self, prompt: PromptType, max_out_len: int,
125131

126132
response = ''
127133
for output in self.chatbot.completions_v1(
128-
session_id=threading.currentThread().ident,
129134
prompt=prompt,
130135
model=self.model_name,
131136
max_tokens=max_out_len,
132137
temperature=temperature,
133138
top_p=0.8,
134-
top_k=1):
139+
top_k=50,
140+
session_id=threading.currentThread().ident,
141+
):
135142
response += output['choices'][0]['text']
136143
response = valid_str(response)
137144
if end_str:

0 commit comments

Comments
 (0)