133 changes: 86 additions & 47 deletions opencompass/configs/datasets/SciReasoner/peer_gen.py
@@ -3,7 +3,11 @@
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import PEER_postprocess, PEER_Evaluator, PEER_Dataset, PEER_postprocess_float_compare, \
PEER_postprocess_default
PEER_postprocess_default, PEERRuleEvaluator, peer_llm_judge_postprocess
from opencompass.evaluator import (
CascadeEvaluator,
GenericLLMEvaluator,
)

TASKS = [
'solubility',
@@ -37,63 +41,98 @@
# Use the default postprocess to keep the original output for LLM judgement.
# PEER_postprocess is applied in the evaluation stage as a fast comparison of the output against the ground truth.
eval_llm_cfg = dict(
evaluator=dict(type=PEER_Evaluator,
openai_key='EMPTY', gpt_model='gpt-4.1-mini'),
evaluator=dict(type=PEER_Evaluator),
pred_postprocessor=dict(type=PEER_postprocess_default),
dataset_postprocessor=dict(type=PEER_postprocess_default),
)

JUDGE_TEMPLATE = """
Please determine whether this answer is correct. Definitions: 'Correct': the core conclusion of the model's answer (if any) is completely consistent with the reference answer (literal identity is not required). 'Incorrect': the core conclusion of the model's answer is inconsistent with the reference answer, or the core conclusion is not clearly expressed.
Reference answer: {reference}
Model answer: {prediction}
If correct, answer 'True'; if incorrect, answer 'False'. Please only answer 'True' or 'False'.
""".strip()



eval_stability_cfg = dict(
evaluator=dict(type=PEER_Evaluator, task='stability'),
pred_postprocessor=dict(type=PEER_postprocess_float_compare, compare_number=1),
dataset_postprocessor=dict(type=PEER_postprocess_float_compare, compare_number=1),
)


PEER_datasets = []
mini_PEER_datasets = []

for task in TASKS:
if task != 'stability':
PEER_datasets.append(
dict(
abbr=f'SciReasoner-PEER_{task}',
type=PEER_Dataset,
path='opencompass/SciReasoner-PEER',
task=task,
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_llm_cfg),
)
mini_PEER_datasets.append(
dict(
abbr=f'SciReasoner-PEER_{task}-mini',
type=PEER_Dataset,
path='opencompass/SciReasoner-PEER',
task=task,
mini_set=True,
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_llm_cfg),
)
else:
PEER_datasets.append(
dict(
abbr=f'SciReasoner-PEER_{task}',
type=PEER_Dataset,
path='opencompass/SciReasoner-PEER',
task=task,
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_stability_cfg),
)
mini_PEER_datasets.append(
dict(
abbr=f'SciReasoner-PEER_{task}-mini',
type=PEER_Dataset,
path='opencompass/SciReasoner-PEER',
task=task,
mini_set=True,
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_stability_cfg),
)

peer_llm_evaluator_cfg = dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs."
)
],
round=[
dict(role='HUMAN', prompt=JUDGE_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=PEER_Dataset,
path='opencompass/SciReasoner-PEER',
task=task,
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=peer_llm_judge_postprocess),
)

peer_rule_evaluator_cfg = dict(
type=PEERRuleEvaluator
)

cascade_evaluator = dict(
type=CascadeEvaluator,
rule_evaluator=peer_rule_evaluator_cfg,
llm_evaluator=peer_llm_evaluator_cfg,
parallel=False,
)

cascade_eval_llm_cfg = dict(
evaluator=cascade_evaluator,
pred_postprocessor=dict(type=PEER_postprocess_default),
dataset_postprocessor=dict(type=PEER_postprocess_default),
)




PEER_datasets.append(
dict(
abbr=f'SciReasoner-PEER_{task}',
type=PEER_Dataset,
path='opencompass/SciReasoner-PEER',
task=task,
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=cascade_eval_llm_cfg),
)
mini_PEER_datasets.append(
dict(
abbr=f'SciReasoner-PEER_{task}-mini',
type=PEER_Dataset,
path='opencompass/SciReasoner-PEER',
task=task,
mini_set=True,
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=cascade_eval_llm_cfg),
)
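Reviewer note: cascade_eval_llm_cfg above swaps the single LLM judge for a two-stage flow. Below is a minimal sketch of the intended behaviour, assuming parallel=False means the LLM judge is only consulted for samples the rule evaluator rejects (the actual semantics live in opencompass.evaluator.CascadeEvaluator); cascade_score, rule_score and llm_judge are illustrative names, not OpenCompass APIs.

```python
# Illustrative sketch only, not the OpenCompass implementation.
from typing import Callable, Dict, List


def cascade_score(predictions: List[str],
                  references: List[str],
                  rule_score: Callable[[List[str], List[str]], Dict],
                  llm_judge: Callable[[str, str], bool]) -> Dict:
    """Rule-based scoring first, LLM judging only for rule failures."""
    result = rule_score(predictions, references)
    details = result['details']
    for pred, ref, detail in zip(predictions, references, details):
        if not detail['correct']:
            # Fall back to the LLM judge for samples the rules rejected.
            detail['correct'] = llm_judge(pred, ref)
            detail['llm_judged'] = True
    accuracy = 100 * sum(d['correct'] for d in details) / len(details)
    return {'accuracy': accuracy, 'details': details}
```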

17 changes: 2 additions & 15 deletions opencompass/configs/datasets/SciReasoner/scireasoner_gen.py
@@ -30,26 +30,13 @@
from opencompass.configs.datasets.SciReasoner.UMG import UMG_Datasets, mini_UMG_Datasets

# full eval set
scireasoner_datasets_full = bio_instruction_datasets + composition_material_datasets + GUE_datasets + smol_datasets + \
scireasoner_full_datasets = bio_instruction_datasets + composition_material_datasets + GUE_datasets + smol_datasets + \
Retrosynthesis_uspto50k_datasets + LLM4Mat_datasets + modulus_material_datasets + \
mol_biotext_datasets + mol_mol_datasets + mol_protein_datasets + opi_datasets + PEER_datasets + \
uncond_material_datasets + uncond_RNA_datasets + uncond_protein_datasets + UMG_Datasets

# mini eval set
scireasoner_datasets_mini = mini_bio_instruction_datasets + mini_composition_material_datasets + mini_GUE_datasets + mini_smol_datasets + \
scireasoner_mini_datasets = mini_bio_instruction_datasets + mini_composition_material_datasets + mini_GUE_datasets + mini_smol_datasets + \
mini_Retrosynthesis_uspto50k_datasets + mini_LLM4Mat_datasets + mini_modulus_material_datasets + \
mini_mol_biotext_datasets + mini_mol_mol_datasets + mini_mol_protein_datasets + mini_opi_datasets + mini_PEER_datasets + \
mini_uncond_material_datasets + mini_uncond_RNA_datasets + mini_uncond_protein_datasets + mini_UMG_Datasets

# scireasoner_mini_datasets =\
# (
# # mini_bio_instruction_datasets +
# # mini_composition_material_datasets +
# # mini_modulus_material_datasets +
# # mini_GUE_datasets +
# # mini_LLM4Mat_datasets +
# # mini_mol_biotext_datasets + mini_mol_mol_datasets + mini_mol_protein_datasets + mini_opi_datasets + mini_Retrosynthesis_uspto50k_datasets + mini_smol_datasets
# # mini_UMG_Datasets + mini_uncond_material_datasets
# mini_uncond_RNA_datasets
# # mini_uncond_protein_datasets
# )
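Reviewer note: the rename from scireasoner_datasets_full / scireasoner_datasets_mini to scireasoner_full_datasets / scireasoner_mini_datasets means downstream run configs need updating. A hypothetical run config picking up the new names (the import path is assumed from this file's location):

```python
# Hypothetical run config; mmengine's read_base is the usual OpenCompass
# mechanism for pulling dataset lists out of config modules.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.SciReasoner.scireasoner_gen import (
        scireasoner_full_datasets,
        scireasoner_mini_datasets,
    )

# Use the mini split for a quick smoke test; switch to
# scireasoner_full_datasets for the complete benchmark.
datasets = scireasoner_mini_datasets
```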
75 changes: 74 additions & 1 deletion opencompass/datasets/SciReasoner/PEER.py
@@ -7,7 +7,7 @@
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Union
from typing import Any, Dict, List, Optional, Union

import numpy as np
from datasets import Dataset, DatasetDict
@@ -469,3 +469,76 @@ def score(self, predictions, references):
}

return metrics


class PEERRuleEvaluator(BaseEvaluator):

def score(self,
predictions: List,
references: List,
test_set: Optional[List] = None) -> Dict:
if len(predictions) != len(references):
return {
'error': 'predictions and references have different length'
}

if not isinstance(predictions[0], list):
predictions = [[pred] for pred in predictions]
if not isinstance(references[0], list):
references = [[ref] for ref in references]

details = []
correct_count = 0

for i, (pred_list, ref_list) in enumerate(zip(predictions,
references)):
raw_pred = pred_list[0] if isinstance(pred_list,
list) else pred_list
raw_ref = ref_list[0] if isinstance(ref_list, list) else ref_list

clean_pred = PEER_postprocess(raw_pred).strip().lower()
clean_ref = PEER_postprocess(raw_ref).strip().lower()

is_correct = False

if clean_pred in ['yes', 'no'] and clean_ref in ['yes', 'no']:
if clean_pred == clean_ref:
is_correct = True

if is_correct:
correct_count += 1

details.append({
'pred': raw_pred,
'answer': raw_ref,
'clean_pred': clean_pred,
'clean_ref': clean_ref,
'correct': is_correct
})

return {
'accuracy': correct_count / len(predictions) * 100,
'details': details
}


def peer_llm_judge_postprocess(output: Dict, output_path: str) -> Dict:
new_details = []
for prediction_id, result in output.items():
content = result.get('prediction', '').strip().upper()

# Any verdict containing 'TRUE' counts as correct; 'FALSE' or an
# unparseable verdict counts as incorrect.
is_correct = 'TRUE' in content

new_details.append({
'prediction': content,
'correct': is_correct,
'llm_judge': content
})

return {'details': new_details}
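Reviewer note: a quick illustration of the shapes the two new helpers work with; the prediction strings are made up, and whether PEER_postprocess reduces them to a bare yes/no token depends on its implementation elsewhere in this file.

```python
# Made-up example inputs; only the output structure is asserted here.
evaluator = PEERRuleEvaluator()
result = evaluator.score(
    predictions=['Yes, the protein is soluble.', 'Probably not.'],
    references=['Yes', 'No'],
)
# result['accuracy'] is a percentage; result['details'][i]['correct'] is True
# only when both sides normalise cleanly to 'yes'/'no' and match.

judged = peer_llm_judge_postprocess(
    {'0': {'prediction': 'True'}, '1': {'prediction': 'False'}},
    output_path='/tmp/peer_judge.json',
)
# judged['details'][0] -> {'prediction': 'TRUE', 'correct': True, 'llm_judge': 'TRUE'}
# judged['details'][1] -> {'prediction': 'FALSE', 'correct': False, 'llm_judge': 'FALSE'}
```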
5 changes: 3 additions & 2 deletions opencompass/datasets/SciReasoner/uncond_RNA.py
@@ -64,8 +64,9 @@ def score(self, predictions, references):
mfe_values = self.parse_mfe(mfe_file)
avg_mfe = sum(mfe_values) / len(mfe_values) if mfe_values else None

rfam_cm = 'Rfam/Rfam.cm'
rfam_clanin = 'Rfam/Rfam.clanin'
cache_dir = os.environ.get('COMPASS_DATA_CACHE', '')
rfam_cm = os.path.join(cache_dir, 'Rfam/Rfam.cm')
rfam_clanin = os.path.join(cache_dir, 'Rfam/Rfam.clanin')
rfam_tblout = self.run_cmscan(fasta_path, tmpdir, rfam_cm,
rfam_clanin)
rfam_families = self.parse_unique_families(rfam_tblout)
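Reviewer note: the COMPASS_DATA_CACHE change is backward compatible, because os.path.join with an empty prefix degrades to the old relative path:

```python
import os

# With the cache variable set, Rfam files are read from the cache root.
os.environ['COMPASS_DATA_CACHE'] = '/data/opencompass_cache'  # example path
cache_dir = os.environ.get('COMPASS_DATA_CACHE', '')
print(os.path.join(cache_dir, 'Rfam/Rfam.cm'))  # /data/opencompass_cache/Rfam/Rfam.cm

# Without it, the join falls back to the previous relative lookup.
del os.environ['COMPASS_DATA_CACHE']
cache_dir = os.environ.get('COMPASS_DATA_CACHE', '')
print(os.path.join(cache_dir, 'Rfam/Rfam.cm'))  # Rfam/Rfam.cm
```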