From d3c7b251b40dd57c61fb8c798b0af881d875faee Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 5 Jan 2026 20:07:55 +0800 Subject: [PATCH 1/6] first version of new eval --- .../unified_bench_eval_type1.jsonl | 6 + dataflow/operators/core_text/__init__.py | 1 + .../eval/unified_bench_dataset_evaluator.py | 1215 +++++++++++++++++ .../generate/bench_answer_generator.py | 250 ++++ .../unified_bench_eval_pipeline.py | 66 + 5 files changed, 1538 insertions(+) create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type1.jsonl create mode 100644 dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py create mode 100644 dataflow/operators/core_text/generate/bench_answer_generator.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py diff --git a/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl new file mode 100644 index 00000000..a76d9002 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl @@ -0,0 +1,6 @@ +{"text": "This is a simple test sentence to measure perplexity for the first unified bench type."} +{"text": "The capital of France is Paris."} +{"text": "Please evaluate the language model perplexity on this short example."} +{"text": "Machine learning enables computers to learn patterns from data."} +{"text": "Perplexity is a common metric for evaluating language models on text scoring tasks."} +{"text": "666233gigity"} \ No newline at end of file diff --git a/dataflow/operators/core_text/__init__.py b/dataflow/operators/core_text/__init__.py index 933fb105..2ed5a57e 100644 --- a/dataflow/operators/core_text/__init__.py +++ b/dataflow/operators/core_text/__init__.py @@ -8,6 +8,7 @@ from .generate.text2multihopqa_generator import Text2MultiHopQAGenerator from .generate.embedding_generator import EmbeddingGenerator from .generate.retrieval_generator import RetrievalGenerator + from .generate.bench_answer_generator import BenchAnswerGenerator from .eval.bench_dataset_evaluator import BenchDatasetEvaluator from .eval.bench_dataset_evaluator_question import BenchDatasetEvaluatorQuestion from .eval.text2qa_sample_evaluator import Text2QASampleEvaluator diff --git a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py new file mode 100644 index 00000000..6a09990b --- /dev/null +++ b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py @@ -0,0 +1,1215 @@ +from __future__ import annotations + +import json +import os +import re +import time +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union +import numpy as np +import pandas as pd +from math_verify import parse, verify +import torch +import torch.nn.functional as F +from transformers import AutoModelForCausalLM, AutoTokenizer + +from dataflow import get_logger +from dataflow.core import OperatorABC +from dataflow.core.prompt import DIYPromptABC +from dataflow.core.prompt import prompt_restrict +from dataflow.prompts.model_evaluation.general import AnswerJudgePrompt +from dataflow.core import LLMServingABC +from dataflow.utils.registry import OPERATOR_REGISTRY +from dataflow.utils.storage import DataFlowStorage +from dataflow.utils.reasoning.AnswerExtraction import StringCleaner, UnitTextManager, AnswerExtractor + + +@prompt_restrict( + AnswerJudgePrompt +) + +@OPERATOR_REGISTRY.register() +class 
UnifiedBenchDatasetEvaluator(OperatorABC):
+    """
+    Unified bench evaluation operator: supports 6 keys-type + metric combinations.
+
+    Evaluation types (bench_dataflow_eval_type), see the docs for details:
+      - key1_text_score
+      - key2_qa
+      - key2_q_ma
+      - key3_q_choices_a
+      - key3_q_choices_as
+      - key3_q_a_rejected
+
+    Core idea:
+      only bench_dataflow_eval_type + metric_type + keys_map + (optional) context_key are passed in
+      - the evaluator internally takes care of:
+        1) reading the dataframe
+        2) picking the keys
+        3) assembling the prompt (via prompt_template or the default template)
+        4) computing the metric
+        5) writing result columns back + saving statistics to disk
+    """
+
+    # -----------------------------
+    # Constructor
+    # -----------------------------
+    def __init__(
+        self,
+        eval_result_path: Optional[str] = None,
+        eval_type: Literal[
+            "key1_text_score",
+            "key2_qa",
+            "key2_q_ma",
+            "key3_q_choices_a",
+            "key3_q_choices_as",
+            "key3_q_a_rejected",
+        ] = "key2_qa",
+        llm_serving: Optional[LLMServingABC] = None,
+        prompt_template: Union[AnswerJudgePrompt, DIYPromptABC] = AnswerJudgePrompt,
+        system_prompt: str = "You are a helpful assistant specialized in evaluating answer correctness.",
+        metric_type: Optional[str] = None,
+        use_semantic_judge: bool = False,
+    ):
+        if eval_result_path is None:
+            timestamp = int(time.time())
+            eval_result_path = f"result_bencheval/UnifiedBenchDatasetEvaluator_result_{timestamp}.json"
+
+        self.eval_result_path = eval_result_path
+        self.eval_type = eval_type
+        self.llm_serving = llm_serving
+        self.prompt_template = prompt_template
+        self.system_prompt = system_prompt
+        self.metric_type = metric_type
+        self.use_semantic_judge = use_semantic_judge
+
+        unit_manager = UnitTextManager()
+        string_cleaner = StringCleaner(unit_manager)
+        self.answer_extractor = AnswerExtractor(string_cleaner)
+
+        self.logger = get_logger()
+        self.empty_responses_count = 0
+
+    # -----------------------------
+    # Helper: required-column check
+    # -----------------------------
+    def _check_columns(self, dataframe: pd.DataFrame, cols: List[str]) -> bool:
+        ok = True
+        for c in cols:
+            if c not in dataframe.columns:
+                self.logger.error(f"Required column '{c}' not found in dataframe")
+                ok = False
+        return ok
+
+    # -----------------------------
+    # Helper: unified context concatenation
+    # -----------------------------
+    def _normalize_context(self, ctx: Any) -> Optional[str]:
+        if ctx is None:
+            return None
+        if isinstance(ctx, float) and np.isnan(ctx):
+            return None
+        if isinstance(ctx, list):
+            parts = []
+            for x in ctx:
+                if x is None:
+                    continue
+                s = str(x).strip()
+                if s:
+                    parts.append(s)
+            return "\n".join(parts) if parts else None
+        s = str(ctx).strip()
+        return s if s else None
+
+    # -----------------------------
+    # Helper: default prompt (used when prompt_template is missing or build_prompt is unavailable)
+    # -----------------------------
+    def _default_prompt(
+        self,
+        *,
+        question: Optional[str] = None,
+        context: Optional[str] = None,
+        text: Optional[str] = None,
+        choices: Optional[List[str]] = None,
+        task: str = "",
+    ) -> str:
+        if task == "text_score":
+            return (text or "").strip()
+
+        ctx_block = f"Context:\n{context}\n\n" if context else ""
+        q_block = f"Question:\n{(question or '').strip()}\n\n"
+
+        if choices is not None:
+            # Normalize choices into the A./B./C. format so templates can substitute it and the fallback parser can read it
+            letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            formatted = []
+            for i, ch in enumerate(choices):
+                tag = letters[i] if i < len(letters) else str(i)
+                formatted.append(f"{tag}. 
{str(ch)}") + choices_block = "Choices:\n" + "\n".join(formatted) + "\n\n" + return f"{ctx_block}{q_block}{choices_block}Answer:" + else: + return f"{ctx_block}{q_block}Answer:" + + def _build_prompt( + self, + *, + question: Optional[str] = None, + context: Optional[str] = None, + text: Optional[str] = None, + choices: Optional[List[str]] = None, + task: str = "", + ) -> str: + # 兼容你的 prompt_template(通常有 build_prompt) + if self.prompt_template is not None and hasattr(self.prompt_template, "build_prompt"): + try: + # 给模板更丰富的变量,模板不用可以忽略 + choices_text = None + if choices is not None: + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + formatted = [] + for i, ch in enumerate(choices): + tag = letters[i] if i < len(letters) else str(i) + formatted.append(f"{tag}. {str(ch)}") + choices_text = "\n".join(formatted) + + return self.prompt_template.build_prompt( + question=question, + context=context, + text=text, + choices=choices, + choices_text=choices_text, + task=task, + ) + except Exception as e: + self.logger.error(f"prompt_template.build_prompt failed, fallback to default. err={e}") + + return self._default_prompt(question=question, context=context, text=text, choices=choices, task=task) + + # ----------------------------- + # math_verify compare + # ----------------------------- + def _math_verify_compare(self, answer: Any, ground_truth: Any) -> bool: + try: + return verify(parse(str(ground_truth)), parse(str(answer))) + except Exception: + try: + return verify(parse(ground_truth), parse(answer)) + except Exception: + return False + + # ----------------------------- + # 多参考答案:把 targets 解析成 List[str] + # ----------------------------- + def _normalize_targets(self, targets: Any) -> List[str]: + if targets is None: + return [] + if isinstance(targets, float) and np.isnan(targets): + return [] + if isinstance(targets, list): + return [str(x) for x in targets if str(x).strip()] + + s = str(targets).strip() + if not s: + return [] + + # 尝试 json list + if (s.startswith("[") and s.endswith("]")) or (s.startswith("{") and s.endswith("}")): + try: + obj = json.loads(s) + if isinstance(obj, list): + return [str(x) for x in obj if str(x).strip()] + except Exception: + pass + + # 常见分隔 + if "||" in s: + parts = [p.strip() for p in s.split("||")] + elif "|" in s: + parts = [p.strip() for p in s.split("|")] + elif ";" in s: + parts = [p.strip() for p in s.split(";")] + else: + parts = [s] + return [p for p in parts if p] + + # ----------------------------- + # choice 解析(fallback 用) + # ----------------------------- + def _parse_choice_from_text(self, text: str, num_choices: int) -> Optional[int]: + if text is None: + return None + t = str(text).strip() + if not t: + return None + + # 先找 A/B/C... 
+ m = re.search(r"\b([A-Za-z])\b", t) + if m: + idx = ord(m.group(1).upper()) - ord("A") + if 0 <= idx < num_choices: + return idx + + # 再找数字(1-based 或 0-based 都兼容) + m = re.search(r"\b(\d+)\b", t) + if m: + val = int(m.group(1)) + if 0 <= val < num_choices: + return val + if 1 <= val <= num_choices: + return val - 1 + + return None + + def _parse_multiselect_set(self, text: str, num_choices: int) -> Optional[set]: + if text is None: + return None + s = str(text).strip() + if not s: + return None + + # json list + if s.startswith("[") and s.endswith("]"): + try: + obj = json.loads(s) + if isinstance(obj, list): + res = set() + for x in obj: + if isinstance(x, str): + x = x.strip() + if len(x) == 1 and x.isalpha(): + idx = ord(x.upper()) - ord("A") + if 0 <= idx < num_choices: + res.add(idx) + elif x.isdigit(): + v = int(x) + if 0 <= v < num_choices: + res.add(v) + elif 1 <= v <= num_choices: + res.add(v - 1) + elif isinstance(x, int): + if 0 <= x < num_choices: + res.add(x) + elif 1 <= x <= num_choices: + res.add(x - 1) + return res + except Exception: + pass + + # 字母集合:如 "A,C,D" / "B D" + letters = re.findall(r"\b([A-Za-z])\b", s) + if letters: + res = set() + for ch in letters: + idx = ord(ch.upper()) - ord("A") + if 0 <= idx < num_choices: + res.add(idx) + return res if res else None + + # 数字集合:如 "1,3,4" + nums = re.findall(r"\b(\d+)\b", s) + if nums: + res = set() + for n in nums: + v = int(n) + if 0 <= v < num_choices: + res.add(v) + elif 1 <= v <= num_choices: + res.add(v - 1) + return res if res else None + + return None + + # ----------------------------- + # micro-F1 / Jaccard + # ----------------------------- + def _set_metrics(self, pred: set, gold: set) -> Dict[str, float]: + if pred is None or gold is None: + return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "jaccard": 0.0, "exact_set": 0.0} + inter = len(pred & gold) + p = inter / len(pred) if len(pred) > 0 else 0.0 + r = inter / len(gold) if len(gold) > 0 else 0.0 + f1 = (2 * p * r / (p + r)) if (p + r) > 0 else 0.0 + j = inter / len(pred | gold) if len(pred | gold) > 0 else 0.0 + exact = 1.0 if pred == gold else 0.0 + return {"precision": float(p), "recall": float(r), "f1": float(f1), "jaccard": float(j), "exact_set": float(exact)} + + # ----------------------------- + # LLM loglikelihood 适配(尽量兼容不同 serving 实现) + # ----------------------------- + def _ll_batch(self, prompts: List[str], continuations: List[str]) -> Optional[List[float]]: + if self.llm_serving is None: + return None + + # 尝试常见方法名 + cand_names = [ + "loglikelihood_batch", + "loglikelihood", + "get_loglikelihood_batch", + "get_loglikelihood", + "score_batch", + "score", + ] + for name in cand_names: + if hasattr(self.llm_serving, name): + fn = getattr(self.llm_serving, name) + try: + # 兼容多种签名: (prompts, continuations) / (pairs) + try: + return fn(prompts=prompts, continuations=continuations) # type: ignore + except TypeError: + try: + return fn(prompts, continuations) # type: ignore + except TypeError: + pairs = list(zip(prompts, continuations)) + return fn(pairs) # type: ignore + except Exception as e: + self.logger.error(f"llm_serving.{name} failed: {e}") + return None + + self.logger.error("llm_serving does not provide any loglikelihood/score interface.") + return None + + def _ppl_batch(self, texts: List[str]) -> Optional[List[float]]: + if self.llm_serving is None: + return None + + model_id = getattr(self.llm_serving, "real_model_path", None) or getattr(self.llm_serving, "hf_model_name_or_path", None) + hf_cache_dir = getattr(self.llm_serving, 
"hf_cache_dir", None) + trust_remote_code = getattr(self.llm_serving, "trust_remote_code", True) + + if model_id is None: + self.logger.error("llm_serving does not expose real_model_path/hf_model_name_or_path; cannot compute ppl.") + return None + + try: + tokenizer = getattr(self, "_ppl_hf_tokenizer", None) + model = getattr(self, "_ppl_hf_model", None) + loaded_id = getattr(self, "_ppl_hf_model_id", None) + if tokenizer is None or model is None or loaded_id != model_id: + tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=hf_cache_dir, trust_remote_code=trust_remote_code) + model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=hf_cache_dir, trust_remote_code=trust_remote_code) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + model.eval() + self._ppl_hf_tokenizer = tokenizer + self._ppl_hf_model = model + self._ppl_hf_model_id = model_id + except Exception as e: + self.logger.error(f"failed to load hf model/tokenizer for ppl: {e}") + return None + + try: + device = next(model.parameters()).device + batch_size = 4 + ppls: List[float] = [] + max_len = getattr(getattr(model, "config", None), "max_position_embeddings", None) + + for start in range(0, len(texts), batch_size): + batch_texts = ["" if t is None else str(t) for t in texts[start:start + batch_size]] + enc = tokenizer( + batch_texts, + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_len, + ) + input_ids = enc["input_ids"].to(device) + attention_mask = enc.get("attention_mask", None) + if attention_mask is not None: + attention_mask = attention_mask.to(device) + + with torch.no_grad(): + logits = model(input_ids=input_ids, attention_mask=attention_mask).logits + + shift_logits = logits[:, :-1, :].contiguous() + shift_labels = input_ids[:, 1:].contiguous() + + if attention_mask is None: + shift_mask = torch.ones_like(shift_labels, dtype=torch.float32, device=device) + else: + shift_mask = attention_mask[:, 1:].to(dtype=torch.float32) + + vocab_size = shift_logits.size(-1) + token_nll = F.cross_entropy( + shift_logits.view(-1, vocab_size), + shift_labels.view(-1), + reduction="none", + ).view(shift_labels.size(0), -1) + + nll_sum = (token_nll * shift_mask).sum(dim=1) + denom = shift_mask.sum(dim=1).clamp_min(1.0) + ppl_batch = torch.exp(nll_sum / denom).detach().cpu().tolist() + ppls.extend([float(x) for x in ppl_batch]) + + return ppls + except Exception as e: + self.logger.error(f"hf ppl computation failed: {e}") + return None + + # ----------------------------- + # 统计落盘 + # ----------------------------- + def _save_stats(self, bench_name_or_prefix: str, stats: Dict[str, Any]) -> None: + os.makedirs(os.path.dirname(self.eval_result_path), exist_ok=True) + df = pd.DataFrame([stats]) + df.to_json(self.eval_result_path, orient="records", force_ascii=False, indent=2) + self.logger.success(f"Statistics saved to {self.eval_result_path}") + + # ----------------------------- + # 主入口 + # ----------------------------- + def run( + self, + storage: DataFlowStorage, + keys_map: Optional[Dict[str, str]] = None, + context_key: Optional[str] = None, + input_pred_key: str = "generated_ans", + ) -> List[str]: + """ + keys_map 示例: + - key1_text_score: {"text": "text"} + - key2_qa: {"question":"question", "target":"golden_answer"} + - key2_q_ma: {"question":"question", "targets":"gold_answers"} + - key3_q_choices_a: {"question":"question", "choices":"choices", "label":"label"} + - key3_q_choices_as: {"question":"question", "choices":"choices", "labels":"labels"} 
+ - key3_q_a_rejected: {"question":"question", "better":"chosen", "rejected":"rejected"} + """ + df = storage.read("dataframe") + eval_type = self.eval_type + + + + # 输出列统一 + if "eval_valid" not in df.columns: + df["eval_valid"] = True + df["eval_error"] = "" + df["eval_pred"] = None + df["eval_score"] = np.nan # 数值型评分(accuracy 类用 0/1) + + # 默认 metric + metric_type = self.metric_type + if metric_type is None: + metric_type = self._default_metric_for_type(eval_type, self.use_semantic_judge) + + if keys_map is None: + self.logger.error("keys_map is required.") + storage.write(df) + return ["eval_valid", "eval_error", "eval_pred", "eval_score"] + + # context 处理:统一读一列(可无) + ctx_series = None + if context_key is not None: + if context_key not in df.columns: + self.logger.error(f"context_key '{context_key}' not found; treat as None.") + else: + ctx_series = df[context_key] + + # 分发 + if eval_type == "key1_text_score": + required = [keys_map.get("text", "")] + if not self._check_columns(df, required): + storage.write(df) + return required + + text_col = keys_map["text"] + texts = [str(x) if x is not None else "" for x in df[text_col].tolist()] + ppl = self._ppl_batch(texts) + if ppl is None: + df["eval_valid"] = False + df["eval_error"] = "ppl_unavailable" + storage.write(df) + self._save_stats(storage.file_name_prefix, { + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + "total_samples": len(df), + "valid_samples": 0, + "note": "ppl unavailable in llm_serving", + }) + return [text_col, "eval_score", "eval_valid", "eval_error"] + + df["eval_score"] = ppl + df["eval_pred"] = None + df["eval_valid"] = True + storage.write(df) + + stats = { + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + "total_samples": int(len(df)), + "valid_samples": int(len(df)), + "ppl_mean": float(np.mean(ppl)) if len(ppl) else 0.0, + } + self._save_stats(storage.file_name_prefix, stats) + return [text_col, "eval_score", "eval_valid", "eval_error"] + + elif eval_type in ("key2_qa", "key2_q_ma"): + # QA:默认走 math_verify 抽取+对比(可选 semantic_judge) + # 单参考:target + # 多参考:targets + question_col = keys_map.get("question", "") + if eval_type == "key2_qa": + target_col = keys_map.get("target", "") + required = [question_col, target_col, input_pred_key] + if not self._check_columns(df, required): + storage.write(df) + return required + + self._eval_qa_single( + df=df, + question_col=question_col, + target_col=target_col, + pred_col=input_pred_key, + ctx_series=ctx_series, + metric_type=metric_type, + ) + storage.write(df) + + stats = self._stats_for_binary(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return [question_col, target_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + + else: + targets_col = keys_map.get("targets", "") + required = [question_col, targets_col, input_pred_key] + if not self._check_columns(df, required): + storage.write(df) + return required + + self._eval_qa_multi( + df=df, + question_col=question_col, + targets_col=targets_col, + pred_col=input_pred_key, + ctx_series=ctx_series, + metric_type=metric_type, + ) + storage.write(df) + + stats = self._stats_for_binary(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return [question_col, 
targets_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + + elif eval_type == "key3_q_choices_a": + question_col = keys_map.get("question", "") + choices_col = keys_map.get("choices", "") + label_col = keys_map.get("label", "") + required = [question_col, choices_col, label_col] + # 若没有 llm_serving,则 fallback 需要 pred_col + if self.llm_serving is None: + required.append(input_pred_key) + + if not self._check_columns(df, required): + storage.write(df) + return required + + self._eval_mc_single( + df=df, + question_col=question_col, + choices_col=choices_col, + label_col=label_col, + ctx_series=ctx_series, + metric_type=metric_type, + pred_col=input_pred_key, + ) + storage.write(df) + + stats = self._stats_for_binary(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return [question_col, choices_col, label_col, "eval_score", "eval_valid", "eval_error"] + + elif eval_type == "key3_q_choices_as": + question_col = keys_map.get("question", "") + choices_col = keys_map.get("choices", "") + labels_col = keys_map.get("labels", "") + required = [question_col, choices_col, labels_col, input_pred_key] # 先按“解析模型输出集合”实现 + if not self._check_columns(df, required): + storage.write(df) + return required + + self._eval_mc_multi( + df=df, + question_col=question_col, + choices_col=choices_col, + labels_col=labels_col, + pred_col=input_pred_key, + metric_type=metric_type, + ) + storage.write(df) + + stats = self._stats_for_multiselect(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return [question_col, choices_col, labels_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + + elif eval_type == "key3_q_a_rejected": + question_col = keys_map.get("question", "") + better_col = keys_map.get("better", "") + rejected_col = keys_map.get("rejected", "") + required = [question_col, better_col, rejected_col] + if not self._check_columns(df, required): + storage.write(df) + return required + + if self.llm_serving is None: + # 这个类型没有 pred_col 可 fallback,只能报错 + df["eval_valid"] = False + df["eval_error"] = "llm_serving_required_for_pairwise" + storage.write(df) + stats = { + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + "total_samples": int(len(df)), + "valid_samples": 0, + "note": "pairwise requires llm_serving loglikelihood", + } + self._save_stats(storage.file_name_prefix, stats) + return required + ["eval_score", "eval_valid", "eval_error"] + + self._eval_pairwise( + df=df, + question_col=question_col, + better_col=better_col, + rejected_col=rejected_col, + ctx_series=ctx_series, + metric_type=metric_type, + ) + storage.write(df) + + stats = self._stats_for_binary(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return required + ["eval_score", "eval_valid", "eval_error"] + + else: + self.logger.error(f"Unknown bench_dataflow_eval_type: {eval_type}") + storage.write(df) + return ["eval_valid", "eval_error", "eval_pred", "eval_score"] + + # ----------------------------- + # 默认 metric + # ----------------------------- + def _default_metric_for_type(self, t: str, use_semantic_judge: bool) -> str: + if t == "key1_text_score": + return "ppl" + if t == "key2_qa": + 
return "semantic_judge" if use_semantic_judge else "math_verify" + if t == "key2_q_ma": + return "any_math_verify" + if t == "key3_q_choices_a": + return "ll_choice_acc" + if t == "key3_q_choices_as": + return "micro_f1" + if t == "key3_q_a_rejected": + return "pairwise_ll_winrate" + return "unknown" + + # ----------------------------- + # 统计:binary(0/1) + # ----------------------------- + def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: + total = len(df) + valid_mask = df["eval_valid"] == True + valid = int(valid_mask.sum()) + # eval_score: 0/1 + if valid > 0: + acc = float(df.loc[valid_mask, "eval_score"].mean()) + else: + acc = 0.0 + return { + "total_samples": int(total), + "valid_samples": int(valid), + "accuracy": float(acc), + } + + # ----------------------------- + # 统计:多选(f1/jaccard 等) + # ----------------------------- + def _stats_for_multiselect(self, df: pd.DataFrame) -> Dict[str, Any]: + total = len(df) + valid_mask = df["eval_valid"] == True + valid = int(valid_mask.sum()) + # eval_score 默认存 f1 + if valid > 0: + f1_mean = float(df.loc[valid_mask, "eval_score"].mean()) + else: + f1_mean = 0.0 + # 如果你想要更多维度(jaccard/exact_set),可以从 eval_pred 里扩展存 dict,这里先给最小 + return { + "total_samples": int(total), + "valid_samples": int(valid), + "micro_f1_mean": float(f1_mean), + } + + # ----------------------------- + # key2_qa:单参考 + # ----------------------------- + def _eval_qa_single( + self, + df: pd.DataFrame, + question_col: str, + target_col: str, + pred_col: str, + ctx_series: Optional[pd.Series], + metric_type: str, + ) -> None: + if metric_type == "semantic_judge": + # 语义 judge 需要 llm_serving.generate_from_input + if self.llm_serving is None or not hasattr(self.llm_serving, "generate_from_input"): + self.logger.error("semantic_judge requires llm_serving.generate_from_input") + df["eval_valid"] = False + df["eval_error"] = "semantic_judge_unavailable" + return + + # 默认用“预测 vs 标准”直接 judge(你旧逻辑那套需要特定 Prompt,这里只做通用;你可自行替换为你自己的 AnswerJudgePrompt) + inputs = [] + row_indices = [] + for idx, row in df.iterrows(): + gt = row[target_col] + pred = row[pred_col] + if gt is None or (isinstance(gt, str) and gt.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_reference" + continue + if pred is None or (isinstance(pred, str) and pred.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_prediction" + continue + + prompt = ( + "You are an evaluator. 
Decide if the prediction is correct given the reference.\n" + f"Reference:\n{gt}\n\nPrediction:\n{pred}\n\n" + 'Return JSON: {"judgement_result": true/false}' + ) + inputs.append(prompt) + row_indices.append(idx) + + if not inputs: + return + + try: + responses = self.llm_serving.generate_from_input(user_inputs=inputs, system_prompt=self.system_prompt) + except Exception as e: + self.logger.error(f"semantic_judge generate_from_input failed: {e}") + for idx in row_indices: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "semantic_judge_failed" + return + + for idx, resp in zip(row_indices, responses): + ok = self._resolve_judge_response(resp) + df.at[idx, "eval_score"] = 1.0 if ok else 0.0 + df.at[idx, "eval_pred"] = None + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + return + + # 默认:math_verify + for idx, row in df.iterrows(): + gt = row[target_col] + pred_raw = row[pred_col] + if gt is None or (isinstance(gt, str) and gt.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_reference" + continue + if pred_raw is None or (isinstance(pred_raw, str) and pred_raw.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_prediction" + continue + + final_answer = self.answer_extractor.extract_answer(pred_raw, None) + ok = self._math_verify_compare(final_answer, gt) + df.at[idx, "eval_score"] = 1.0 if ok else 0.0 + df.at[idx, "eval_pred"] = str(final_answer) + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + # ----------------------------- + # key2_q_ma:多参考 + # ----------------------------- + def _eval_qa_multi( + self, + df: pd.DataFrame, + question_col: str, + targets_col: str, + pred_col: str, + ctx_series: Optional[pd.Series], + metric_type: str, + ) -> None: + # 默认:any_math_verify + for idx, row in df.iterrows(): + targets_raw = row[targets_col] + pred_raw = row[pred_col] + targets = self._normalize_targets(targets_raw) + + if len(targets) == 0: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_references" + continue + if pred_raw is None or (isinstance(pred_raw, str) and pred_raw.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_prediction" + continue + + final_answer = self.answer_extractor.extract_answer(pred_raw, None) + ok_any = False + for gt in targets: + if self._math_verify_compare(final_answer, gt): + ok_any = True + break + + df.at[idx, "eval_score"] = 1.0 if ok_any else 0.0 + df.at[idx, "eval_pred"] = str(final_answer) + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + # ----------------------------- + # key3_q_choices_a:单选 + # ----------------------------- + def _eval_mc_single( + self, + df: pd.DataFrame, + question_col: str, + choices_col: str, + label_col: str, + ctx_series: Optional[pd.Series], + metric_type: str, + pred_col: str, + ) -> None: + # 优先:loglikelihood + if metric_type == "ll_choice_acc" and self.llm_serving is not None: + # 批量做:每行要对 choices 逐个算 ll,先实现清晰版(你后面可优化 batching) + for idx, row in df.iterrows(): + q = row[question_col] + choices = row[choices_col] + label = row[label_col] + + if choices is None or (isinstance(choices, float) and np.isnan(choices)): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_choices" + continue + if not isinstance(choices, list): + # 尝试 json + try: + choices = json.loads(str(choices)) + except Exception: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "choices_not_list" + continue + if len(choices) == 
0: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_choices" + continue + + ctx = None + if ctx_series is not None: + ctx = self._normalize_context(ctx_series.loc[idx]) + + prompt = self._build_prompt(question=str(q), context=ctx, choices=[str(c) for c in choices], task="mc_single") + + # label 规范化为 idx + gold_idx = self._normalize_label_to_index(label, len(choices)) + if gold_idx is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "invalid_label" + continue + + prompts = [prompt] * len(choices) + conts = [] + for c in choices: + c_str = str(c) + # 常见做法:continuation 前补空格,避免直接拼在 Answer: 后面太粘连 + conts.append((" " + c_str) if (len(prompt) > 0 and not prompt.endswith((" ", "\n"))) else c_str) + + lls = self._ll_batch(prompts, conts) + if lls is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "ll_unavailable" + continue + + pred_idx = int(np.argmax(np.array(lls))) + df.at[idx, "eval_pred"] = int(pred_idx) + df.at[idx, "eval_score"] = 1.0 if pred_idx == gold_idx else 0.0 + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + return + + # fallback:从 pred_col 解析(generation 输出里抓 A/B/C 或数字) + self.logger.warning("ll_choice_acc unavailable; fallback to parse generated output for single-choice.") + for idx, row in df.iterrows(): + choices = row[choices_col] + label = row[label_col] + pred_text = row[pred_col] if pred_col in df.columns else None + + if choices is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_choices" + continue + if not isinstance(choices, list): + try: + choices = json.loads(str(choices)) + except Exception: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "choices_not_list" + continue + + gold_idx = self._normalize_label_to_index(label, len(choices)) + pred_idx = self._parse_choice_from_text(str(pred_text), len(choices)) if pred_text is not None else None + if gold_idx is None or pred_idx is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "parse_failed" + continue + + df.at[idx, "eval_pred"] = int(pred_idx) + df.at[idx, "eval_score"] = 1.0 if pred_idx == gold_idx else 0.0 + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + def _normalize_label_to_index(self, label: Any, n: int) -> Optional[int]: + if label is None: + return None + # 若 label 本身是 int + if isinstance(label, (int, np.integer)): + v = int(label) + if 0 <= v < n: + return v + if 1 <= v <= n: + return v - 1 + return None + s = str(label).strip() + if not s: + return None + # A/B/C + if len(s) == 1 and s.isalpha(): + idx = ord(s.upper()) - ord("A") + return idx if 0 <= idx < n else None + # 数字 + if s.isdigit(): + v = int(s) + if 0 <= v < n: + return v + if 1 <= v <= n: + return v - 1 + return None + + # ----------------------------- + # key3_q_choices_as:多选 + # ----------------------------- + def _eval_mc_multi( + self, + df: pd.DataFrame, + question_col: str, + choices_col: str, + labels_col: str, + pred_col: str, + metric_type: str, + ) -> None: + # 这里按你说的“先最小落地”:从 pred_col 解析集合 -> micro_f1 + for idx, row in df.iterrows(): + choices = row[choices_col] + gold = row[labels_col] + pred_text = row[pred_col] + + if choices is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_choices" + continue + if not isinstance(choices, list): + try: + choices = json.loads(str(choices)) + except Exception: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "choices_not_list" + continue + + n = len(choices) + gold_set = 
self._normalize_multilabel_to_set(gold, n) + pred_set = self._parse_multiselect_set(str(pred_text), n) + + if gold_set is None or pred_set is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "parse_failed" + continue + + m = self._set_metrics(pred_set, gold_set) + # eval_score 默认存 f1(你的上层聚合最常用) + df.at[idx, "eval_score"] = float(m["f1"]) + # eval_pred 存更丰富的信息,便于 debug + df.at[idx, "eval_pred"] = json.dumps( + {"pred_set": sorted(list(pred_set)), "gold_set": sorted(list(gold_set)), **m}, + ensure_ascii=False, + ) + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + def _normalize_multilabel_to_set(self, labels: Any, n: int) -> Optional[set]: + if labels is None: + return None + if isinstance(labels, float) and np.isnan(labels): + return None + if isinstance(labels, list): + s = set() + for x in labels: + idx = self._normalize_label_to_index(x, n) + if idx is None: + continue + s.add(idx) + return s if len(s) > 0 else set() + + s = str(labels).strip() + if not s: + return None + # json list + if s.startswith("[") and s.endswith("]"): + try: + obj = json.loads(s) + if isinstance(obj, list): + res = set() + for x in obj: + idx = self._normalize_label_to_index(x, n) + if idx is not None: + res.add(idx) + return res + except Exception: + pass + + # 分隔符 + parts = re.split(r"[,\s;/|]+", s) + res = set() + for p in parts: + p = p.strip() + if not p: + continue + idx = self._normalize_label_to_index(p, n) + if idx is not None: + res.add(idx) + return res if len(res) > 0 else set() + + # ----------------------------- + # key3_q_a_rejected:偏好对比 + # ----------------------------- + def _eval_pairwise( + self, + df: pd.DataFrame, + question_col: str, + better_col: str, + rejected_col: str, + ctx_series: Optional[pd.Series], + metric_type: str, + ) -> None: + # 默认:pairwise_ll_winrate + for idx, row in df.iterrows(): + q = row[question_col] + better = row[better_col] + rej = row[rejected_col] + + if better is None or (isinstance(better, str) and better.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_better" + continue + if rej is None or (isinstance(rej, str) and rej.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_rejected" + continue + + ctx = None + if ctx_series is not None: + ctx = self._normalize_context(ctx_series.loc[idx]) + + prompt = self._build_prompt(question=str(q), context=ctx, task="pairwise") + + prompts = [prompt, prompt] + conts = [] + better_s = str(better) + rej_s = str(rej) + conts.append((" " + better_s) if (len(prompt) > 0 and not prompt.endswith((" ", "\n"))) else better_s) + conts.append((" " + rej_s) if (len(prompt) > 0 and not prompt.endswith((" ", "\n"))) else rej_s) + + lls = self._ll_batch(prompts, conts) + if lls is None or len(lls) != 2: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "ll_unavailable" + continue + + win = 1.0 if float(lls[0]) > float(lls[1]) else 0.0 + df.at[idx, "eval_score"] = win + df.at[idx, "eval_pred"] = json.dumps({"ll_better": float(lls[0]), "ll_rejected": float(lls[1])}, ensure_ascii=False) + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + # ----------------------------- + # 语义 judge 响应解析(兼容你旧逻辑) + # ----------------------------- + def _resolve_judge_response(self, response: Any) -> bool: + if response is None or (isinstance(response, str) and response.strip() == ""): + self.empty_responses_count += 1 + return False + try: + s = str(response) + # 尝试 json + try: + obj = json.loads(s) + if 
isinstance(obj, dict) and "judgement_result" in obj:
+                    return bool(obj["judgement_result"])
+            except Exception:
+                pass
+
+            pattern = re.compile(r'"judgement_result"\s*:\s*(true|false)', re.IGNORECASE)
+            m = pattern.search(s)
+            if m:
+                return m.group(1).lower() == "true"
+            # fallback
+            return ("true" in s.lower()) and ("false" not in s.lower())
+        except Exception as e:
+            self.logger.error(f"Response format error: {response}. Error: {e}")
+            return False
+
+    # -----------------------------
+    # Description
+    # -----------------------------
+    @staticmethod
+    def get_desc(lang: str = "zh"):
+        if lang == "zh":
+            return (
+                "统一 Bench 评测算子:支持 6 类纯文本评测范式。\n\n"
+                "支持类型:\n"
+                "- key1_text_score(默认 ppl)\n"
+                "- key2_qa(默认 math_verify / 可选 semantic_judge)\n"
+                "- key2_q_ma(默认 any_math_verify)\n"
+                "- key3_q_choices_a(默认 ll_choice_acc,若无 ll 接口则 fallback 解析生成)\n"
+                "- key3_q_choices_as(默认 micro_f1:解析多选集合后算 F1)\n"
+                "- key3_q_a_rejected(默认 pairwise_ll_winrate)\n\n"
+                "统一输出列:eval_score / eval_pred / eval_valid / eval_error,并支持统计落盘。"
+            )
+        return (
+            "Unified bench evaluator supporting 6 text-only task archetypes.\n"
+            "Outputs: eval_score / eval_pred / eval_valid / eval_error with stats saved."
+        )
diff --git a/dataflow/operators/core_text/generate/bench_answer_generator.py b/dataflow/operators/core_text/generate/bench_answer_generator.py
new file mode 100644
index 00000000..48976f72
--- /dev/null
+++ b/dataflow/operators/core_text/generate/bench_answer_generator.py
@@ -0,0 +1,250 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Dict, List, Literal, Optional, Union
+
+import numpy as np
+import pandas as pd
+
+from dataflow import get_logger
+from dataflow.core import OperatorABC, LLMServingABC
+from dataflow.core.prompt import DIYPromptABC, prompt_restrict
+from dataflow.utils.registry import OPERATOR_REGISTRY
+from dataflow.utils.storage import DataFlowStorage
+
+
+@prompt_restrict()  # kept generic on purpose; no fixed prompt class is enforced
+
+@OPERATOR_REGISTRY.register()
+class BenchAnswerGenerator(OperatorABC):
+    """
+    Unified answer-generation operator for bench evaluation, parameter-aligned with UnifiedBenchDatasetEvaluator.
+
+    Inputs:
+      - eval_type: evaluation type, same values as the evaluator
+      - keys_map: column name of each field, same as the evaluator
+      - context_key: optional context column name; treated as None if not given
+    Outputs:
+      - output_key: column holding the generated answers, defaults to generated_ans
+      - types that need no generation do not write output_key and simply return an empty list
+    """
+
+    def __init__(
+        self,
+        llm_serving: LLMServingABC,
+        prompt_template: Optional[Union[DIYPromptABC, Any]] = None,
+        system_prompt: str = "You are a helpful assistant specialized in generating answers to questions.",
+        allow_overwrite: bool = False,
+        # whether to force generation for every type; by default only types that need predictions generate
+        force_generate: bool = False,
+    ):
+        self.logger = get_logger()
+        self.llm_serving = llm_serving
+        self.prompt_template = prompt_template
+        self.system_prompt = system_prompt
+        self.allow_overwrite = allow_overwrite
+        self.force_generate = force_generate
+
+    # ---------- helpers ----------
+    def _normalize_context(self, ctx: Any) -> Optional[str]:
+        if ctx is None:
+            return None
+        if isinstance(ctx, float) and np.isnan(ctx):
+            return None
+        if isinstance(ctx, list):
+            parts = []
+            for x in ctx:
+                if x is None:
+                    continue
+                s = str(x).strip()
+                if s:
+                    parts.append(s)
+            return "\n".join(parts) if parts else None
+        s = str(ctx).strip()
+        return s if s else None
+
+    def _ensure_list(self, v: Any) -> Optional[List[str]]:
+        if v is None:
+            return None
+        if isinstance(v, float) and np.isnan(v):
+            return None
+        if isinstance(v, list):
+            return [str(x) for x in v]
+        s = str(v).strip()
+        if not s:
+            return None
+        # try to parse a JSON list
+        if s.startswith("[") and 
s.endswith("]"): + try: + obj = json.loads(s) + if isinstance(obj, list): + return [str(x) for x in obj] + except Exception: + pass + return None + + def _format_choices_text(self, choices: List[str]) -> str: + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + lines = [] + for i, c in enumerate(choices): + tag = letters[i] if i < len(letters) else str(i) + lines.append(f"{tag}. {c}") + return "\n".join(lines) + + def _build_prompt_fallback( + self, + *, + eval_type: str, + question: Optional[str], + context: Optional[str], + choices: Optional[List[str]], + ) -> str: + ctx_block = f"Context:\n{context}\n\n" if context else "" + q_block = f"Question:\n{(question or '').strip()}\n\n" + + if eval_type in ("key2_qa", "key2_q_ma"): + return f"{ctx_block}{q_block}Answer:" + if eval_type == "key3_q_choices_a": + ch = self._format_choices_text(choices or []) + return f"{ctx_block}{q_block}Choices:\n{ch}\n\nChoose exactly one option. Output only the option letter (e.g., A).\nAnswer:" + if eval_type == "key3_q_choices_as": + ch = self._format_choices_text(choices or []) + return ( + f"{ctx_block}{q_block}Choices:\n{ch}\n\n" + "This is a multi-select question. Output JSON only, format: {\"choices\": [\"A\",\"C\"]}.\nAnswer:" + ) + # key1_text_score / key3_q_a_rejected 默认不需要生成 + return f"{ctx_block}{q_block}Answer:" + + def _build_prompt( + self, + *, + eval_type: str, + question: Optional[str], + context: Optional[str], + choices: Optional[List[str]], + ) -> str: + if self.prompt_template is not None and hasattr(self.prompt_template, "build_prompt"): + try: + return self.prompt_template.build_prompt( + eval_type=eval_type, + question=question, + context=context, + choices=choices, + choices_text=self._format_choices_text(choices) if choices else None, + ) + except Exception as e: + self.logger.error(f"prompt_template.build_prompt 失败, fallback 默认模板: {e}") + return self._build_prompt_fallback(eval_type=eval_type, question=question, context=context, choices=choices) + + def _call_generate(self, prompts: List[str]) -> List[str]: + if not hasattr(self.llm_serving, "generate_from_input"): + self.logger.error("llm_serving 缺少 generate_from_input 接口") + return [""] * len(prompts) + try: + # 兼容有无 system_prompt 参数 + try: + return self.llm_serving.generate_from_input(user_inputs=prompts, system_prompt=self.system_prompt) + except TypeError: + return self.llm_serving.generate_from_input(prompts) + except Exception as e: + self.logger.error(f"generate_from_input 执行失败: {e}") + return [""] * len(prompts) + + def _need_generation(self, eval_type: str) -> bool: + # evaluator 当前实现里: + # - key1_text_score: 不需要 generated_ans + # - key2_qa / key2_q_ma: 需要 generated_ans + # - key3_q_choices_a: 若 evaluator 用 ll 则不需要; 但为了可测试/兜底, 这里默认生成 + # - key3_q_choices_as: evaluator 当前用解析 generated_ans -> 需要 + # - key3_q_a_rejected: evaluator 用 ll 比较 better vs rejected -> 不需要 + if self.force_generate: + return eval_type != "key1_text_score" + return eval_type in ("key2_qa", "key2_q_ma", "key3_q_choices_a", "key3_q_choices_as") + + # ---------- 主入口 ---------- + def run( + self, + storage: DataFlowStorage, + eval_type: Literal[ + "key1_text_score", + "key2_qa", + "key2_q_ma", + "key3_q_choices_a", + "key3_q_choices_as", + "key3_q_a_rejected", + ], + keys_map: Dict[str, str], + context_key: Optional[str] = None, + output_key: str = "generated_ans", + ) -> List[str]: + df = storage.read("dataframe") + + if not self._need_generation(eval_type): + self.logger.info(f"[BenchAnswerGenerator] eval_type={eval_type} 默认不需要生成, 跳过") + storage.write(df) + return 
[] + + if (output_key in df.columns) and (not self.allow_overwrite): + self.logger.error(f"输出列已存在且不允许覆盖: {output_key}") + storage.write(df) + return [] + + # 读取字段 + q_col = keys_map.get("question") + if not q_col or q_col not in df.columns: + self.logger.error(f"缺少 question 列, keys_map.question={q_col}") + storage.write(df) + return [] + + ch_col = keys_map.get("choices") + need_choices = eval_type in ("key3_q_choices_a", "key3_q_choices_as") + if need_choices and (not ch_col or ch_col not in df.columns): + self.logger.error(f"缺少 choices 列, keys_map.choices={ch_col}") + storage.write(df) + return [] + + ctx_series = None + if context_key: + if context_key in df.columns: + ctx_series = df[context_key] + else: + self.logger.error(f"context_key 不存在: {context_key}, 视为 None") + + prompts: List[str] = [] + for idx, row in df.iterrows(): + q = row[q_col] + ctx = self._normalize_context(ctx_series.loc[idx]) if ctx_series is not None else None + + choices = None + if need_choices: + choices = self._ensure_list(row[ch_col]) + if not choices: + # choices 为空, 仍然生成一个可追踪的输出, 避免整体崩 + choices = [""] + + prompts.append( + self._build_prompt( + eval_type=eval_type, + question=str(q) if q is not None else "", + context=ctx, + choices=choices, + ) + ) + + answers = self._call_generate(prompts) + df[output_key] = answers + out_file = storage.write(df) + self.logger.info(f"[BenchAnswerGenerator] 生成完成, 保存到 {out_file}") + return [output_key] + + @staticmethod + def get_desc(lang: str = "zh"): + if lang == "zh": + return ( + "用于 bench 评测的统一生成算子, 与 evaluator 的 eval_type + keys_map 对齐。\n" + "默认只对需要生成输出的类型生成 output_key=generated_ans, 并支持 context_key 作为可选上下文。\n" + "可通过 allow_overwrite 控制是否覆盖已存在的输出列。" + ) + return "Unified bench answer generator aligned with evaluator eval_type and keys_map." 
\ No newline at end of file diff --git a/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py b/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py new file mode 100644 index 00000000..67de5601 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py @@ -0,0 +1,66 @@ +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +DIY_PROMPT_ANSWER = """Please output the answer.""" + +class UnifiedBenchEvalPipeline(): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + + self.storage = FileStorage( + first_entry_file_name="../example_data/core_text_data/bench_eval_data.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/scy/Model/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + prompt_template=None, + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type="key1_text_score", + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map={"text": "text"}, + context_key=None, + output_key="generated_ans", + ) + """ + all types: + "key1_text_score", + "key2_qa", + "key2_q_ma", + "key3_q_choices_a", + "key3_q_choices_as", + "key3_q_a_rejected", + """ + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map={"text": "text"}, + context_key=None, + input_pred_key="generated_ans", + + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.forward() From da0404fa52837cb4d1c9344d3f1e0626768e941a Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 5 Jan 2026 23:27:43 +0800 Subject: [PATCH 2/6] add all types of evaluation of model --- .../unified_bench_eval_type1.jsonl | 3 +- .../unified_bench_eval_type2.jsonl | 15 ++ .../unified_bench_eval_type3.jsonl | 6 + .../unified_bench_eval_type4.jsonl | 10 ++ .../unified_bench_eval_type5.jsonl | 10 ++ .../unified_bench_eval_type6.jsonl | 10 ++ dataflow/operators/core_text/__init__.py | 1 + .../eval/unified_bench_dataset_evaluator.py | 158 ++++++++++++++++-- .../generate/bench_answer_generator.py | 54 +++--- .../unified_bench_eval_pipeline.py | 35 ++-- .../unified_bench_eval_type1.py | 71 ++++++++ .../unified_bench_eval_type2.py | 81 +++++++++ .../unified_bench_eval_type3.py | 85 ++++++++++ .../unified_bench_eval_type4.py | 84 ++++++++++ .../unified_bench_eval_type5.py | 84 ++++++++++ .../unified_bench_eval_type6.py | 84 ++++++++++ 16 files changed, 744 insertions(+), 47 deletions(-) create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type2.jsonl create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type3.jsonl create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type4.jsonl create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type5.jsonl 
create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type6.jsonl rename dataflow/statics/pipelines/gpu_pipelines/{ => benchmark_eval}/unified_bench_eval_pipeline.py (76%) create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py diff --git a/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl index a76d9002..1d66ba16 100644 --- a/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl +++ b/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl @@ -2,5 +2,4 @@ {"text": "The capital of France is Paris."} {"text": "Please evaluate the language model perplexity on this short example."} {"text": "Machine learning enables computers to learn patterns from data."} -{"text": "Perplexity is a common metric for evaluating language models on text scoring tasks."} -{"text": "666233gigity"} \ No newline at end of file +{"text": "Perplexity is a common metric for evaluating language models on text scoring tasks."} \ No newline at end of file diff --git a/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl new file mode 100644 index 00000000..54f28264 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl @@ -0,0 +1,15 @@ +{"id": "math_001", "question": "Solve for x: 2x + 3 = 11.", "model_answer": "Let's isolate x. Subtract 3 from both sides: 2x = 8. Then divide both sides by 2 to get x = 4. Therefore, the final answer is x = 4.", "golden_label": "x = 4"} +{"id": "math_002", "question": "If a circle has a radius of 5, what is its area?", "model_answer": "We know the formula for the area of a circle is 2πr. Plugging in r = 5 gives area = 10π. Hence, the area is 10π square units.", "golden_label": "25π"} +{"id": "math_003", "question": "Find the derivative of f(x) = x².", "model_answer": "Using the power rule, the derivative of x² is 2x¹ = 2x. Therefore, f'(x) = 2x.", "golden_label": "2x"} +{"id": "math_004", "question": "Solve 3x - 7 = 11.", "model_answer": "Start by adding 7 to both sides: 3x = 18. Divide both sides by 3 to isolate x, which gives x = 6. So the correct solution is x = 6.", "golden_label": "x = 6"} +{"id": "math_005", "question": "A triangle has sides of lengths 3, 4, and 5. What is its area?", "model_answer": "Since 3² + 4² = 5², it’s a right triangle. The area of a right triangle is ½ × base × height = ½ × 3 × 4 = 6. Hence, the area is 6 square units.", "golden_label": "6"} +{"id": "math_006", "question": "Find the vertex of the parabola y = x² + 2x + 1.", "model_answer": "The vertex formula is (-b/2a, c - b²/4a). Here, a = 1, b = 2, c = 1. Compute: -b/2a = -2/2 = -1, and y = 1 - 1 = 0. Wait, I think the vertex is (1, 0).", "golden_label": "(-1, 0)"} +{"id": "math_007", "question": "Simplify (x² - 1)/(x - 1).", "model_answer": "Factor the numerator: (x + 1)(x - 1)/(x - 1). The (x - 1) terms cancel, leaving x + 1. 
Therefore, the simplified result is x + 1.", "golden_label": "x + 1"} +{"id": "math_008", "question": "Evaluate log₂(8).", "model_answer": "We know that 2² = 8, so log₂(8) = 2. Therefore, the result is 2.", "golden_label": "3"} +{"id": "math_009", "question": "Find the indefinite integral ∫x dx.", "model_answer": "The integral of x with respect to x is x²/2 + C. So the answer is x²/2 + C.", "golden_label": "x²/2 + C"} +{"id": "math_010", "question": "Compute the slope of the line passing through (1, 2) and (4, 8).", "model_answer": "Slope m = (y₂ - y₁)/(x₂ - x₁) = (8 - 2)/(4 - 1) = 6/3 = 2. Hence, the slope is 2.", "golden_label": "2"} +{"id": "math_011", "question": "Expand (x + 3)².", "model_answer": "Using the binomial theorem: (x + 3)² = x² + 2×x×3 + 3² = x² + 6x + 9. Therefore, the expanded form is x² + 6x + 9.", "golden_label": "x² + 6x + 9"} +{"id": "math_012", "question": "What is sin(90°)?", "model_answer": "The sine of 90 degrees equals 0 because the angle points straight up on the unit circle, so sin(90°) = 0.", "golden_label": "1"} +{"id": "math_013", "question": "Find the determinant of [[2, 3], [1, 4]].", "model_answer": "The determinant is ad - bc = 2×4 - 3×1 = 8 - 3 = 5. Therefore, det = 5.", "golden_label": "5"} +{"id": "math_014", "question": "Compute 7 × 8.", "model_answer": "7 × 8 equals 54. I multiplied 7 × 7 = 49 and added 5, giving 54.", "golden_label": "56"} +{"id": "math_015", "question": "Find the derivative of f(x) = sin(x).", "model_answer": "The derivative of sin(x) with respect to x is cos(x). Thus, f'(x) = cos(x).", "golden_label": "cos(x)"} diff --git a/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl new file mode 100644 index 00000000..042755a4 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl @@ -0,0 +1,6 @@ +{"eval_type":"key2_q_ma","id":"ma_0001","context":"France's capital city is Paris.","question":"What is the capital of France?","targets":["Paris","The capital of France is Paris."]} +{"eval_type":"key2_q_ma","id":"ma_0002","context":"The chemical symbol for water is H2O.","question":"What is the chemical formula for water?","targets":["H2O","h2o"]} +{"eval_type":"key2_q_ma","id":"ma_0003","context":"Python is a popular programming language created by Guido van Rossum.","question":"Who created Python?","targets":["Guido van Rossum","Guido"]} +{"eval_type":"key2_q_ma","id":"ma_0004","context":"The largest planet in our solar system is Jupiter.","question":"Which is the largest planet in the solar system?","targets":["Jupiter","The largest planet is Jupiter."]} +{"eval_type":"key2_q_ma","id":"ma_0005","context":"Light travels at approximately 300,000 kilometers per second in vacuum.","question":"What is the approximate speed of light in vacuum?","targets":["300000 km/s","300,000 km/s","3e5 km/s","approximately 300,000 kilometers per second"]} +{"eval_type":"key2_q_ma","id":"ma_0006","context":"Shakespeare wrote the tragedy Hamlet.","question":"Who wrote Hamlet?","targets":["William Shakespeare","Shakespeare"]} diff --git a/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl new file mode 100644 index 00000000..db5d04b5 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl @@ -0,0 +1,10 @@ +{"eval_type":"key3_q_choices_a","id":"mc_0001","context":null,"question":"What is the capital of 
France?","choices":["Paris","London","Berlin","Rome"],"label":0} +{"eval_type":"key3_q_choices_a","id":"mc_0002","context":null,"question":"In Python, what does len([1, 2, 3]) return?","choices":["2","3","4","An error"],"label":1} +{"eval_type":"key3_q_choices_a","id":"mc_0003","context":"Assume standard SI units.","question":"Which physical quantity is measured in Newtons (N)?","choices":["Energy","Force","Power","Voltage"],"label":1} +{"eval_type":"key3_q_choices_a","id":"mc_0004","context":null,"question":"Which planet is the largest in the Solar System?","choices":["Earth","Mars","Jupiter","Venus"],"label":2} +{"eval_type":"key3_q_choices_a","id":"mc_0005","context":"Consider basic probability with a fair six-sided die.","question":"What is the probability of rolling a 6?","choices":["1/2","1/3","1/6","1/12"],"label":2} +{"eval_type":"key3_q_choices_a","id":"mc_0006","context":"You are reading an English sentence.","question":"Choose the word that best completes the sentence: \"She ___ to the store yesterday.\"","choices":["go","goes","went","going"],"label":2} +{"eval_type":"key3_q_choices_a","id":"mc_0007","context":"Pick the most appropriate next step in the sequence.","question":"What is the next number in the sequence 2, 4, 8, 16, ?","choices":["18","24","32","34"],"label":2} +{"eval_type":"key3_q_choices_a","id":"mc_0008","context":"Computer science basics.","question":"Which data structure uses FIFO (first in, first out)?","choices":["Stack","Queue","Tree","Heap"],"label":1} +{"eval_type":"key3_q_choices_a","id":"mc_0009","context":"Geography.","question":"Which ocean is the largest by surface area?","choices":["Indian Ocean","Atlantic Ocean","Arctic Ocean","Pacific Ocean"],"label":3} +{"eval_type":"key3_q_choices_a","id":"mc_0010","context":"Mathematics.","question":"If x = 3, what is the value of 2x + 5?","choices":["8","10","11","12"],"label":2} diff --git a/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl new file mode 100644 index 00000000..c4c8e567 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl @@ -0,0 +1,10 @@ +{"eval_type":"key3_q_choices_as","id":"ms_0001","context":null,"question":"Which of the following are prime numbers?","choices":["2","9","11","15"],"labels":[0,2]} +{"eval_type":"key3_q_choices_as","id":"ms_0002","context":"Select all correct statements about Python.","question":"Which of the following are valid ways to create a list in Python?","choices":["[]","list()","{}","()"],"labels":[0,1]} +{"eval_type":"key3_q_choices_as","id":"ms_0003","context":"Consider basic linear algebra.","question":"Which of the following are valid matrix operations?","choices":["Matrix addition (same shape)","Matrix multiplication (inner dims match)","Element-wise division is always defined","Taking the determinant of a non-square matrix"],"labels":[0,1]} +{"eval_type":"key3_q_choices_as","id":"ms_0004","context":"HTTP request methods are standardized verbs.","question":"Which of the following are HTTP methods?","choices":["GET","FETCH","POST","PUSH"],"labels":[0,2]} +{"eval_type":"key3_q_choices_as","id":"ms_0005","context":"JSON has a small set of primitive and composite types.","question":"Which of the following are valid JSON value types?","choices":["string","tuple","number","object"],"labels":[0,2,3]} +{"eval_type":"key3_q_choices_as","id":"ms_0006","context":"Pick all that apply.","question":"Which tasks are typically supervised learning?","choices":["Image 
classification","K-means clustering","Linear regression","PCA"],"labels":[0,2]} +{"eval_type":"key3_q_choices_as","id":"ms_0007","context":"Recall basic operating system concepts.","question":"Which of the following are common process scheduling algorithms?","choices":["Round-robin","Shortest Job First","Breadth-first search","First Come First Served"],"labels":[0,1,3]} +{"eval_type":"key3_q_choices_as","id":"ms_0008","context":"Select all true statements about TCP.","question":"Which of the following are features of TCP?","choices":["Connection-oriented","Guarantees in-order delivery","Message boundaries are preserved","Congestion control mechanisms exist"],"labels":[0,1,3]} +{"eval_type":"key3_q_choices_as","id":"ms_0009","context":"Choose all correct options.","question":"Which of the following numbers are divisible by 3?","choices":["21","22","24","25"],"labels":[0,2]} +{"eval_type":"key3_q_choices_as","id":"ms_0010","context":"Basic set theory.","question":"Which of the following sets are subsets of {1,2,3}?","choices":["{1,2}","{2,4}","{ }","{1,2,3,4}"],"labels":[0,2]} diff --git a/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl new file mode 100644 index 00000000..8b562084 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl @@ -0,0 +1,10 @@ +{"eval_type":"key3_q_a_rejected","id":"pw_0001","context":null,"question":"Explain what overfitting is in machine learning.","better":"Overfitting is when a model learns the training data too closely, including noise, so it performs well on training data but poorly on new, unseen data.","rejected":"Overfitting means the model is too good and always performs well everywhere."} +{"eval_type":"key3_q_a_rejected","id":"pw_0002","context":"Answer concisely in one sentence.","question":"What is the capital of France?","better":"Paris.","rejected":"France is a country in Europe with many cities."} +{"eval_type":"key3_q_a_rejected","id":"pw_0003","context":"Provide a clear step-by-step solution.","question":"Solve: 2x + 5 = 11.","better":"Subtract 5 from both sides to get 2x=6, then divide by 2 to get x=3.","rejected":"x is 11 because 2x plus 5 is 11."} +{"eval_type":"key3_q_a_rejected","id":"pw_0004","context":"You are writing documentation for a Python beginner.","question":"What does a Python list represent?","better":"A list is an ordered, mutable collection of items, written with square brackets like [1, 2, 3].","rejected":"A list is like a dictionary but faster and uses curly braces."} +{"eval_type":"key3_q_a_rejected","id":"pw_0005","context":"Give a polite refusal.","question":"Can you share my friend's private phone number?","better":"Sorry, I can’t help with sharing someone’s private contact information without their permission.","rejected":"Sure, tell me their name and I’ll provide the number."} +{"eval_type":"key3_q_a_rejected","id":"pw_0006","context":"Explain in simple terms.","question":"What is an API?","better":"An API is a set of rules that lets different software systems talk to each other, like a menu of requests you can make and the responses you’ll get.","rejected":"An API is a database that stores all your application’s data."} +{"eval_type":"key3_q_a_rejected","id":"pw_0007","context":"Answer with one short paragraph.","question":"Why do we use HTTPS instead of HTTP?","better":"HTTPS encrypts data between your browser and the server, which helps prevent eavesdropping and tampering, improving security and 
trust.","rejected":"HTTPS is used because it makes websites load faster than HTTP in all cases."} +{"eval_type":"key3_q_a_rejected","id":"pw_0008","context":"Focus on correctness.","question":"What is the derivative of x^2?","better":"The derivative of x^2 with respect to x is 2x.","rejected":"The derivative of x^2 is x."} +{"eval_type":"key3_q_a_rejected","id":"pw_0009","context":"Return a direct answer.","question":"How many bytes are in a kilobyte (KB) in the decimal system?","better":"1 KB is 1000 bytes in the decimal (SI) system.","rejected":"1 KB is always 1024 bytes no matter what."} +{"eval_type":"key3_q_a_rejected","id":"pw_0010","context":"Be clear and practical.","question":"How can you reduce Python virtual environment dependency conflicts?","better":"Pin dependencies with exact versions, use a lock file when possible, isolate projects per environment, and upgrade packages in a controlled way.","rejected":"Just install everything globally; conflicts will resolve themselves."} diff --git a/dataflow/operators/core_text/__init__.py b/dataflow/operators/core_text/__init__.py index 2ed5a57e..592469c9 100644 --- a/dataflow/operators/core_text/__init__.py +++ b/dataflow/operators/core_text/__init__.py @@ -11,6 +11,7 @@ from .generate.bench_answer_generator import BenchAnswerGenerator from .eval.bench_dataset_evaluator import BenchDatasetEvaluator from .eval.bench_dataset_evaluator_question import BenchDatasetEvaluatorQuestion + from .eval.unified_bench_dataset_evaluator import UnifiedBenchDatasetEvaluator from .eval.text2qa_sample_evaluator import Text2QASampleEvaluator from .eval.prompted_eval import PromptedEvaluator from .filter.prompted_filter import PromptedFilter diff --git a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py index 6a09990b..e0199dc6 100644 --- a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py +++ b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py @@ -4,6 +4,7 @@ import os import re import time +import unicodedata from dataclasses import dataclass from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union import numpy as np @@ -189,14 +190,47 @@ def _build_prompt( # ----------------------------- # math_verify compare # ----------------------------- - def _math_verify_compare(self, answer: Any, ground_truth: Any) -> bool: + def _try_math_verify_compare(self, answer: Any, ground_truth: Any) -> Optional[bool]: try: return verify(parse(str(ground_truth)), parse(str(answer))) except Exception: try: return verify(parse(ground_truth), parse(answer)) except Exception: - return False + return None + + def _math_verify_compare(self, answer: Any, ground_truth: Any) -> bool: + res = self._try_math_verify_compare(answer, ground_truth) + return bool(res) if res is not None else False + + def _normalize_text_for_match(self, text: Any) -> str: + if text is None: + return "" + s = unicodedata.normalize("NFKC", str(text)) + s = s.translate(str.maketrans({ + "₀": "0", + "₁": "1", + "₂": "2", + "₃": "3", + "₄": "4", + "₅": "5", + "₆": "6", + "₇": "7", + "₈": "8", + "₉": "9", + })) + s = s.strip() + s = re.sub(r"\s+", " ", s) + if s.endswith((".", "。", "!", "!", "?", "?")): + s = s[:-1].strip() + return s.casefold() + + def _text_contains_match(self, pred: Any, ref: Any) -> bool: + p = self._normalize_text_for_match(pred) + r = self._normalize_text_for_match(ref) + if not p or not r: + return False + return (r in p) or (p in r) # 
----------------------------- # 多参考答案:把 targets 解析成 List[str] @@ -367,8 +401,106 @@ def _ll_batch(self, prompts: List[str], continuations: List[str]) -> Optional[Li self.logger.error(f"llm_serving.{name} failed: {e}") return None - self.logger.error("llm_serving does not provide any loglikelihood/score interface.") - return None + model_id = getattr(self.llm_serving, "real_model_path", None) or getattr(self.llm_serving, "hf_model_name_or_path", None) + hf_cache_dir = getattr(self.llm_serving, "hf_cache_dir", None) + trust_remote_code = getattr(self.llm_serving, "trust_remote_code", True) + + if model_id is None: + self.logger.error("llm_serving does not expose real_model_path/hf_model_name_or_path; cannot compute loglikelihood.") + return None + + try: + tokenizer = getattr(self, "_ll_hf_tokenizer", None) + model = getattr(self, "_ll_hf_model", None) + loaded_id = getattr(self, "_ll_hf_model_id", None) + if tokenizer is None or model is None or loaded_id != model_id: + tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=hf_cache_dir, trust_remote_code=trust_remote_code) + model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=hf_cache_dir, trust_remote_code=trust_remote_code) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + model.eval() + self._ll_hf_tokenizer = tokenizer + self._ll_hf_model = model + self._ll_hf_model_id = model_id + except Exception as e: + self.logger.error(f"failed to load hf model/tokenizer for loglikelihood: {e}") + return None + + try: + device = next(model.parameters()).device + pad_id = tokenizer.pad_token_id + if pad_id is None: + pad_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0 + + batch_size = 4 + lls: List[float] = [] + + def _safe_ids(text: str) -> List[int]: + return tokenizer(text, add_special_tokens=False).input_ids + + for start in range(0, len(prompts), batch_size): + ps = ["" if p is None else str(p) for p in prompts[start:start + batch_size]] + cs = ["" if c is None else str(c) for c in continuations[start:start + batch_size]] + + full_ids_list: List[List[int]] = [] + prompt_lens: List[int] = [] + cont_lens: List[int] = [] + + for p, c in zip(ps, cs): + full_ids = _safe_ids(p + c) + p_ids = _safe_ids(p) + if len(p_ids) <= len(full_ids) and full_ids[:len(p_ids)] == p_ids: + prompt_len = len(p_ids) + else: + c_ids = _safe_ids(c) + prompt_len = max(0, len(full_ids) - len(c_ids)) + cont_len = max(0, len(full_ids) - prompt_len) + full_ids_list.append(full_ids) + prompt_lens.append(prompt_len) + cont_lens.append(cont_len) + + max_len = max((len(x) for x in full_ids_list), default=0) + if max_len == 0: + lls.extend([0.0] * len(full_ids_list)) + continue + + input_ids = torch.full((len(full_ids_list), max_len), pad_id, dtype=torch.long, device=device) + attention_mask = torch.zeros((len(full_ids_list), max_len), dtype=torch.long, device=device) + for i, ids in enumerate(full_ids_list): + if not ids: + continue + t = torch.tensor(ids, dtype=torch.long, device=device) + input_ids[i, : t.numel()] = t + attention_mask[i, : t.numel()] = 1 + + with torch.no_grad(): + logits = model(input_ids=input_ids, attention_mask=attention_mask).logits + log_probs = F.log_softmax(logits, dim=-1) + + shift_log_probs = log_probs[:, :-1, :].contiguous() + shift_labels = input_ids[:, 1:].contiguous() + token_ll = shift_log_probs.gather(-1, shift_labels.unsqueeze(-1)).squeeze(-1) + + for i in range(len(full_ids_list)): + cont_len = cont_lens[i] + prompt_len = prompt_lens[i] + if cont_len <= 0: 
+ lls.append(0.0) + continue + start_pos = max(prompt_len, 1) + end_pos = prompt_len + cont_len + start_idx = start_pos - 1 + end_idx = end_pos - 1 + if end_idx <= start_idx: + lls.append(0.0) + continue + ll_val = float(token_ll[i, start_idx:end_idx].sum().detach().cpu()) + lls.append(ll_val) + + return lls + except Exception as e: + self.logger.error(f"hf loglikelihood computation failed: {e}") + return None def _ppl_batch(self, texts: List[str]) -> Optional[List[float]]: if self.llm_serving is None: @@ -478,8 +610,6 @@ def run( df = storage.read("dataframe") eval_type = self.eval_type - - # 输出列统一 if "eval_valid" not in df.columns: df["eval_valid"] = True @@ -787,7 +917,7 @@ def _eval_qa_single( df["eval_error"] = "semantic_judge_unavailable" return - # 默认用“预测 vs 标准”直接 judge(你旧逻辑那套需要特定 Prompt,这里只做通用;你可自行替换为你自己的 AnswerJudgePrompt) + # 默认用“预测 vs 标准”直接 judge(这里只做通用;可自行替换 AnswerJudgePrompt) inputs = [] row_indices = [] for idx, row in df.iterrows(): @@ -845,9 +975,11 @@ def _eval_qa_single( continue final_answer = self.answer_extractor.extract_answer(pred_raw, None) - ok = self._math_verify_compare(final_answer, gt) + text_ok = self._text_contains_match(pred_raw, gt) or self._text_contains_match(final_answer, gt) + math_res = self._try_math_verify_compare(final_answer, gt) + ok = text_ok or (math_res is True) df.at[idx, "eval_score"] = 1.0 if ok else 0.0 - df.at[idx, "eval_pred"] = str(final_answer) + df.at[idx, "eval_pred"] = str(final_answer) if (math_res is True) else str(pred_raw) df.at[idx, "eval_valid"] = True df.at[idx, "eval_error"] = "" @@ -880,13 +1012,17 @@ def _eval_qa_multi( final_answer = self.answer_extractor.extract_answer(pred_raw, None) ok_any = False + matched_by_text = False for gt in targets: - if self._math_verify_compare(final_answer, gt): + text_ok = self._text_contains_match(pred_raw, gt) or self._text_contains_match(final_answer, gt) + math_res = self._try_math_verify_compare(final_answer, gt) + if text_ok or (math_res is True): ok_any = True + matched_by_text = matched_by_text or text_ok break df.at[idx, "eval_score"] = 1.0 if ok_any else 0.0 - df.at[idx, "eval_pred"] = str(final_answer) + df.at[idx, "eval_pred"] = str(pred_raw) if matched_by_text else str(final_answer) df.at[idx, "eval_valid"] = True df.at[idx, "eval_error"] = "" diff --git a/dataflow/operators/core_text/generate/bench_answer_generator.py b/dataflow/operators/core_text/generate/bench_answer_generator.py index 48976f72..bcc619fe 100644 --- a/dataflow/operators/core_text/generate/bench_answer_generator.py +++ b/dataflow/operators/core_text/generate/bench_answer_generator.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import inspect import re from typing import Any, Dict, List, Literal, Optional, Union @@ -32,7 +33,15 @@ class BenchAnswerGenerator(OperatorABC): def __init__( self, - llm_serving: LLMServingABC, + eval_type: Literal[ + "key1_text_score", + "key2_qa", + "key2_q_ma", + "key3_q_choices_a", + "key3_q_choices_as", + "key3_q_a_rejected", + ] = "key2_qa", + llm_serving: Optional[LLMServingABC] = None, prompt_template: Optional[Union[DIYPromptABC, Any]] = None, system_prompt: str = "You are a helpful assistant specialized in generating answers to questions.", allow_overwrite: bool = False, @@ -45,6 +54,7 @@ def __init__( self.system_prompt = system_prompt self.allow_overwrite = allow_overwrite self.force_generate = force_generate + self.eval_type = eval_type # ---------- 工具函数 ---------- def _normalize_context(self, ctx: Any) -> Optional[str]: @@ -127,13 +137,25 @@ def 
_build_prompt( ) -> str: if self.prompt_template is not None and hasattr(self.prompt_template, "build_prompt"): try: - return self.prompt_template.build_prompt( - eval_type=eval_type, - question=question, - context=context, - choices=choices, - choices_text=self._format_choices_text(choices) if choices else None, - ) + fn = getattr(self.prompt_template, "build_prompt") + kwargs = { + "eval_type": eval_type, + "question": question, + "context": context, + "choices": choices, + "choices_text": self._format_choices_text(choices) if choices else None, + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + + sig = inspect.signature(fn) + params = sig.parameters.values() + has_varkw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params) + if has_varkw: + return fn(**kwargs) + + accepted = {p.name for p in params if p.name != "self"} + filtered = {k: v for k, v in kwargs.items() if k in accepted} + return fn(**filtered) except Exception as e: self.logger.error(f"prompt_template.build_prompt 失败, fallback 默认模板: {e}") return self._build_prompt_fallback(eval_type=eval_type, question=question, context=context, choices=choices) @@ -156,30 +178,24 @@ def _need_generation(self, eval_type: str) -> bool: # evaluator 当前实现里: # - key1_text_score: 不需要 generated_ans # - key2_qa / key2_q_ma: 需要 generated_ans - # - key3_q_choices_a: 若 evaluator 用 ll 则不需要; 但为了可测试/兜底, 这里默认生成 + # - key3_q_choices_a: evaluator 可用 ll 做选择题评估 -> 默认不生成 # - key3_q_choices_as: evaluator 当前用解析 generated_ans -> 需要 # - key3_q_a_rejected: evaluator 用 ll 比较 better vs rejected -> 不需要 if self.force_generate: return eval_type != "key1_text_score" - return eval_type in ("key2_qa", "key2_q_ma", "key3_q_choices_a", "key3_q_choices_as") + return eval_type in ("key2_qa", "key2_q_ma", "key3_q_choices_as") # ---------- 主入口 ---------- def run( self, storage: DataFlowStorage, - eval_type: Literal[ - "key1_text_score", - "key2_qa", - "key2_q_ma", - "key3_q_choices_a", - "key3_q_choices_as", - "key3_q_a_rejected", - ], keys_map: Dict[str, str], context_key: Optional[str] = None, output_key: str = "generated_ans", ) -> List[str]: + df = storage.read("dataframe") + eval_type = self.eval_type if not self._need_generation(eval_type): self.logger.info(f"[BenchAnswerGenerator] eval_type={eval_type} 默认不需要生成, 跳过") @@ -247,4 +263,4 @@ def get_desc(lang: str = "zh"): "默认只对需要生成输出的类型生成 output_key=generated_ans, 并支持 context_key 作为可选上下文。\n" "可通过 allow_overwrite 控制是否覆盖已存在的输出列。" ) - return "Unified bench answer generator aligned with evaluator eval_type and keys_map." \ No newline at end of file + return "Unified bench answer generator aligned with evaluator eval_type and keys_map." 
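Note on the log-likelihood scoring used above: the HF fallback added to _ll_batch scores each candidate continuation by summing the log-probabilities of its tokens conditioned on the prompt, and _eval_mc_single then picks the argmax over the choices (the same score drives the pairwise better-vs-rejected comparison). The following is a minimal, standalone sketch of that idea only, not part of the patch; the model id ("gpt2") and the helper name are illustrative assumptions, and the real operator additionally handles batching, padding, and prompt/continuation tokenization mismatches.

    # Sketch: continuation log-likelihood scoring for multiple choice.
    # Assumptions: any small HF causal LM ("gpt2" here) and greedy prefix
    # alignment between prompt tokens and prompt+continuation tokens.
    import torch
    import torch.nn.functional as F
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MODEL_ID = "gpt2"  # stand-in model for the sketch
    tok = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).eval()

    def continuation_loglikelihood(prompt: str, continuation: str) -> float:
        """Sum of log P(token | prefix) over the continuation tokens only."""
        prompt_ids = tok(prompt, add_special_tokens=False).input_ids
        full_ids = tok(prompt + continuation, add_special_tokens=False).input_ids
        input_ids = torch.tensor([full_ids])
        with torch.no_grad():
            logits = model(input_ids).logits          # (1, seq_len, vocab)
        log_probs = F.log_softmax(logits, dim=-1)
        # The token at position i is predicted by the logits at position i-1,
        # so accumulate only over positions belonging to the continuation.
        ll = 0.0
        for pos in range(len(prompt_ids), len(full_ids)):
            ll += log_probs[0, pos - 1, full_ids[pos]].item()
        return ll

    prompt = ("Question:\nWhat is the capital of France?\n\n"
              "Choices:\nA. Paris\nB. London\n\nAnswer:")
    choices = [" A. Paris", " B. London"]
    scores = [continuation_loglikelihood(prompt, c) for c in choices]
    print("predicted index:", int(max(range(len(scores)), key=scores.__getitem__)))

For key3_q_a_rejected the same score is computed once for the better answer and once for the rejected answer, and the sample counts as a win when the better answer receives the higher log-likelihood.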
diff --git a/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py similarity index 76% rename from dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py rename to dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py index 67de5601..f18de41b 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py @@ -2,27 +2,40 @@ from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm from dataflow.core import LLMServingABC - + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + DIY_PROMPT_ANSWER = """Please output the answer.""" +EVAL_TYPE = "key1_text_score" +KEY_MAPS = {"text": "text"} class UnifiedBenchEvalPipeline(): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): self.storage = FileStorage( - first_entry_file_name="../example_data/core_text_data/bench_eval_data.jsonl", + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/scy/Model/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, prompt_template=None, allow_overwrite=False, force_generate=False, @@ -31,7 +44,7 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg self.evaluator_step2 = UnifiedBenchDatasetEvaluator( eval_result_path="./cache_local/eval_result/eval_result.jsonl", llm_serving=self.llm_serving_generator, - eval_type="key1_text_score", + eval_type=EVAL_TYPE, prompt_template=None, use_semantic_judge=False, metric_type=None, # use default metric @@ -40,22 +53,14 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map={"text": "text"}, + keys_map=KEY_MAPS, context_key=None, output_key="generated_ans", ) - """ - all types: - "key1_text_score", - "key2_qa", - "key2_q_ma", - "key3_q_choices_a", - "key3_q_choices_as", - "key3_q_a_rejected", - """ + self.evaluator_step2.run( storage=self.storage.step(), - keys_map={"text": "text"}, + keys_map=KEY_MAPS, context_key=None, input_pred_key="generated_ans", diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py new file mode 100644 index 00000000..f18de41b --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py @@ -0,0 +1,71 @@ +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", 
+"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +DIY_PROMPT_ANSWER = """Please output the answer.""" +EVAL_TYPE = "key1_text_score" +KEY_MAPS = {"text": "text"} + +class UnifiedBenchEvalPipeline(): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py new file mode 100644 index 00000000..14271498 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py @@ -0,0 +1,81 @@ +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key2_qa" +KEY_MAPS = { + "question": "question", + "target": "golden_label" +} + +class AnswerGeneratePromptDIY(DIYPromptABC): + def build_prompt(self, question:str = None): + prompt = f""" + Question: {question} + Answer: + """ + return prompt + +class UnifiedBenchEvalPipeline(): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=AnswerGeneratePromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + 
self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py new file mode 100644 index 00000000..e69fca06 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py @@ -0,0 +1,85 @@ +from dataflow.pipeline.Pipeline import PipelineABC +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key2_q_ma" +KEY_MAPS = { + "context": "context", # optional + "question": "question", + "targets": "targets" +} + +class AnswerGeneratePromptDIY(DIYPromptABC): + def build_prompt(self, question:str = None): + prompt = f""" + Question: {question} + Answer: + """ + return prompt + +class UnifiedBenchEvalPipeline(PipelineABC): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + super().__init__() + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=AnswerGeneratePromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.compile() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py new file mode 100644 index 00000000..d522f265 
--- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py @@ -0,0 +1,84 @@ +from dataflow.pipeline.Pipeline import PipelineABC +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key3_q_choices_a" +KEY_MAPS = { + "context": "context", # optional + "question": "question", + "choices": "choices", + "label": "label" +} + +class MMLUPromptDIY(DIYPromptABC): + def build_prompt(self, question: str = None, choices_text: str = None, context: str = None, **kwargs): + ctx = f"Context:\n{context}\n\n" if context else "" + return f"{ctx}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:" + + +class UnifiedBenchEvalPipeline(PipelineABC): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + super().__init__() + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=MMLUPromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.compile() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py new file mode 100644 index 00000000..90ca3f05 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py @@ -0,0 +1,84 @@ +from dataflow.pipeline.Pipeline import PipelineABC +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key3_q_choices_as" +KEY_MAPS = { + "context": "context", # optional + "question": "question", + "choices": "choices", + "labels": "labels" +} + +class 
MMLUPromptDIY(DIYPromptABC): + def build_prompt(self, question: str = None, choices_text: str = None, context: str = None, **kwargs): + ctx = f"Context:\n{context}\n\n" if context else "" + return f"{ctx}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:" + + +class UnifiedBenchEvalPipeline(PipelineABC): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + super().__init__() + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=MMLUPromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.compile() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py new file mode 100644 index 00000000..18aad7d1 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py @@ -0,0 +1,84 @@ +from dataflow.pipeline.Pipeline import PipelineABC +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key3_q_a_rejected" +KEY_MAPS = { + "context": "context", # optional + "question": "question", + "better": "better", + "rejected": "rejected" +} + +class PreferencePairwisePromptDIY(DIYPromptABC): + def build_prompt(self, question: str = None, context: str = None, **kwargs): + ctx = f"Context:\n{context}\n\n" if context else "" + return f"{ctx}Question:\n{question}\n\nAnswer:" + + +class UnifiedBenchEvalPipeline(PipelineABC): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + super().__init__() + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + 
hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=PreferencePairwisePromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.compile() + pl.forward() From 2024daa61eb066c7bfacd3eb80d6946e844f302b Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 5 Jan 2026 23:29:41 +0800 Subject: [PATCH 3/6] fix bug in local llm serving when cleanup vllm but not started before --- dataflow/serving/local_model_llm_serving.py | 45 ++++++++++++++++----- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/dataflow/serving/local_model_llm_serving.py b/dataflow/serving/local_model_llm_serving.py index 74cc4e38..ad469d86 100644 --- a/dataflow/serving/local_model_llm_serving.py +++ b/dataflow/serving/local_model_llm_serving.py @@ -179,16 +179,34 @@ def generate_embedding_from_input(self, texts: list[str]) -> list[list[float]]: return [output.outputs.embedding for output in outputs] def cleanup(self): - free_mem = torch.cuda.mem_get_info()[0] # 返回可用显存(单位:字节) - total_mem = torch.cuda.get_device_properties(0).total_memory - self.logger.info(f"Free memory: {free_mem / (1024 ** 2):.2f} MB / {total_mem / (1024 ** 2):.2f} MB") - self.logger.info("Cleaning up vLLM backend resources...") self.backend_initialized = False + + if torch.cuda.is_available(): + free_mem = torch.cuda.mem_get_info()[0] + total_mem = torch.cuda.get_device_properties(0).total_memory + self.logger.info(f"Free memory: {free_mem / (1024 ** 2):.2f} MB / {total_mem / (1024 ** 2):.2f} MB") + + self.logger.info("Cleaning up vLLM backend resources...") + + if not hasattr(self, "llm") or self.llm is None: + import gc + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + try: + import ray + ray.shutdown() + except Exception: + pass + return + from vllm.distributed.parallel_state import ( destroy_model_parallel, destroy_distributed_environment, ) - del self.llm.llm_engine + + if hasattr(self.llm, "llm_engine"): + del self.llm.llm_engine del self.llm destroy_model_parallel() destroy_distributed_environment() @@ -196,13 +214,18 @@ def cleanup(self): torch.distributed.destroy_process_group() import gc gc.collect() - torch.cuda.empty_cache() - import ray - ray.shutdown() - free_mem = torch.cuda.mem_get_info()[0] # 返回可用显存(单位:字节) - total_mem = torch.cuda.get_device_properties(0).total_memory + if torch.cuda.is_available(): + torch.cuda.empty_cache() + try: + import ray + ray.shutdown() + except Exception: + pass - self.logger.info(f"Free memory: {free_mem / (1024 ** 2):.2f} MB / {total_mem / (1024 ** 2):.2f} MB") + if torch.cuda.is_available(): + free_mem = torch.cuda.mem_get_info()[0] + total_mem = 
torch.cuda.get_device_properties(0).total_memory + self.logger.info(f"Free memory: {free_mem / (1024 ** 2):.2f} MB / {total_mem / (1024 ** 2):.2f} MB") class LocalModelLLMServing_sglang(LLMServingABC): def __init__( From 3d8472b58b7fd68ef9cfddb0305d94626744aa7d Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 5 Jan 2026 23:34:07 +0800 Subject: [PATCH 4/6] remove useless pl --- .../unified_bench_eval_pipeline.py | 71 ------------------- 1 file changed, 71 deletions(-) delete mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py deleted file mode 100644 index f18de41b..00000000 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py +++ /dev/null @@ -1,71 +0,0 @@ -from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator -from dataflow.utils.storage import FileStorage -from dataflow.serving import LocalModelLLMServing_vllm -from dataflow.core import LLMServingABC - -""" -all types: -"key1_text_score", -"key2_qa", -"key2_q_ma", -"key3_q_choices_a", -"key3_q_choices_as", -"key3_q_a_rejected", -""" - -DIY_PROMPT_ANSWER = """Please output the answer.""" -EVAL_TYPE = "key1_text_score" -KEY_MAPS = {"text": "text"} - -class UnifiedBenchEvalPipeline(): - def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): - - self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl", - cache_path="./cache_local", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", - ) - - self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path - vllm_tensor_parallel_size=1, - vllm_max_tokens=2048, - ) - - self.answer_generator_step1 = BenchAnswerGenerator( - llm_serving=self.llm_serving_generator, - eval_type=EVAL_TYPE, - prompt_template=None, - allow_overwrite=False, - force_generate=False, - ) - - self.evaluator_step2 = UnifiedBenchDatasetEvaluator( - eval_result_path="./cache_local/eval_result/eval_result.jsonl", - llm_serving=self.llm_serving_generator, - eval_type=EVAL_TYPE, - prompt_template=None, - use_semantic_judge=False, - metric_type=None, # use default metric - ) - - def forward(self): - self.answer_generator_step1.run( - storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, - output_key="generated_ans", - ) - - self.evaluator_step2.run( - storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, - input_pred_key="generated_ans", - - ) - -if __name__ == "__main__": - pl = UnifiedBenchEvalPipeline() - pl.forward() From bdc5d87d2309a5bf8d62f1ad8dde24cf97bcc1d6 Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Tue, 6 Jan 2026 12:59:25 +0800 Subject: [PATCH 5/6] fix prompt_template bug in self test --- .../eval/unified_bench_dataset_evaluator.py | 314 ++++++++++-------- .../generate/bench_answer_generator.py | 62 +++- .../unified_bench_eval_type1.py | 8 +- .../unified_bench_eval_type2.py | 8 +- .../unified_bench_eval_type3.py | 8 +- .../unified_bench_eval_type4.py | 8 +- .../unified_bench_eval_type5.py | 8 +- .../unified_bench_eval_type6.py | 8 +- 8 files changed, 250 insertions(+), 174 deletions(-) diff --git 
a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py index e0199dc6..1958a4a0 100644 --- a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py +++ b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import json import os import re @@ -594,9 +592,13 @@ def _save_stats(self, bench_name_or_prefix: str, stats: Dict[str, Any]) -> None: def run( self, storage: DataFlowStorage, - keys_map: Optional[Dict[str, str]] = None, - context_key: Optional[str] = None, + input_keys_map: Optional[Dict[str, str]] = None, + input_context_key: Optional[str] = None, input_pred_key: str = "generated_ans", + output_eval_valid_key: str = "eval_valid", + output_eval_error_key: str = "eval_error", + output_eval_pred_key: str = "eval_pred", + output_eval_score_key: str = "eval_score", ) -> List[str]: """ keys_map 示例: @@ -611,43 +613,43 @@ def run( eval_type = self.eval_type # 输出列统一 - if "eval_valid" not in df.columns: - df["eval_valid"] = True - df["eval_error"] = "" - df["eval_pred"] = None - df["eval_score"] = np.nan # 数值型评分(accuracy 类用 0/1) + if output_eval_valid_key not in df.columns: + df[output_eval_valid_key] = True + df[output_eval_error_key] = "" + df[output_eval_pred_key] = None + df[output_eval_score_key] = np.nan # 数值型评分(accuracy 类用 0/1) # 默认 metric metric_type = self.metric_type if metric_type is None: metric_type = self._default_metric_for_type(eval_type, self.use_semantic_judge) - if keys_map is None: + if input_keys_map is None: self.logger.error("keys_map is required.") storage.write(df) - return ["eval_valid", "eval_error", "eval_pred", "eval_score"] + return [output_eval_valid_key, output_eval_error_key, output_eval_pred_key, output_eval_score_key] # context 处理:统一读一列(可无) ctx_series = None - if context_key is not None: - if context_key not in df.columns: - self.logger.error(f"context_key '{context_key}' not found; treat as None.") + if input_context_key is not None: + if input_context_key not in df.columns: + self.logger.error(f"context_key '{input_context_key}' not found; treat as None.") else: - ctx_series = df[context_key] + ctx_series = df[input_context_key] # 分发 if eval_type == "key1_text_score": - required = [keys_map.get("text", "")] + required = [input_keys_map.get("text", "")] if not self._check_columns(df, required): storage.write(df) return required - text_col = keys_map["text"] + text_col = input_keys_map["text"] texts = [str(x) if x is not None else "" for x in df[text_col].tolist()] ppl = self._ppl_batch(texts) if ppl is None: - df["eval_valid"] = False - df["eval_error"] = "ppl_unavailable" + df[output_eval_valid_key] = False + df[output_eval_error_key] = "ppl_unavailable" storage.write(df) self._save_stats(storage.file_name_prefix, { "bench_name_or_prefix": storage.file_name_prefix, @@ -657,11 +659,11 @@ def run( "valid_samples": 0, "note": "ppl unavailable in llm_serving", }) - return [text_col, "eval_score", "eval_valid", "eval_error"] + return [text_col, output_eval_score_key, output_eval_valid_key, output_eval_error_key] - df["eval_score"] = ppl - df["eval_pred"] = None - df["eval_valid"] = True + df[output_eval_score_key] = ppl + df[output_eval_pred_key] = None + df[output_eval_valid_key] = True storage.write(df) stats = { @@ -673,15 +675,15 @@ def run( "ppl_mean": float(np.mean(ppl)) if len(ppl) else 0.0, } self._save_stats(storage.file_name_prefix, stats) - return [text_col, "eval_score", "eval_valid", 
"eval_error"] + return [text_col, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type in ("key2_qa", "key2_q_ma"): # QA:默认走 math_verify 抽取+对比(可选 semantic_judge) # 单参考:target # 多参考:targets - question_col = keys_map.get("question", "") + question_col = input_keys_map.get("question", "") if eval_type == "key2_qa": - target_col = keys_map.get("target", "") + target_col = input_keys_map.get("target", "") required = [question_col, target_col, input_pred_key] if not self._check_columns(df, required): storage.write(df) @@ -704,10 +706,10 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return [question_col, target_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + return [question_col, target_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] else: - targets_col = keys_map.get("targets", "") + targets_col = input_keys_map.get("targets", "") required = [question_col, targets_col, input_pred_key] if not self._check_columns(df, required): storage.write(df) @@ -730,12 +732,12 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return [question_col, targets_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + return [question_col, targets_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_choices_a": - question_col = keys_map.get("question", "") - choices_col = keys_map.get("choices", "") - label_col = keys_map.get("label", "") + question_col = input_keys_map.get("question", "") + choices_col = input_keys_map.get("choices", "") + label_col = input_keys_map.get("label", "") required = [question_col, choices_col, label_col] # 若没有 llm_serving,则 fallback 需要 pred_col if self.llm_serving is None: @@ -763,12 +765,12 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return [question_col, choices_col, label_col, "eval_score", "eval_valid", "eval_error"] + return [question_col, choices_col, label_col, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_choices_as": - question_col = keys_map.get("question", "") - choices_col = keys_map.get("choices", "") - labels_col = keys_map.get("labels", "") + question_col = input_keys_map.get("question", "") + choices_col = input_keys_map.get("choices", "") + labels_col = input_keys_map.get("labels", "") required = [question_col, choices_col, labels_col, input_pred_key] # 先按“解析模型输出集合”实现 if not self._check_columns(df, required): storage.write(df) @@ -791,12 +793,12 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return [question_col, choices_col, labels_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + return [question_col, choices_col, labels_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_a_rejected": - question_col = keys_map.get("question", "") - better_col = keys_map.get("better", "") - rejected_col = keys_map.get("rejected", "") + question_col = input_keys_map.get("question", "") + better_col = input_keys_map.get("better", "") + rejected_col = input_keys_map.get("rejected", "") required = [question_col, better_col, rejected_col] if not self._check_columns(df, required): storage.write(df) @@ -804,8 +806,9 @@ def run( if self.llm_serving is None: # 这个类型没有 pred_col 可 fallback,只能报错 - df["eval_valid"] = False - df["eval_error"] = 
"llm_serving_required_for_pairwise" + self.logger.error("llm_serving is required for pairwise evaluation") + df[output_eval_valid_key] = False + df[output_eval_error_key] = "llm_serving_required_for_pairwise" storage.write(df) stats = { "bench_name_or_prefix": storage.file_name_prefix, @@ -816,7 +819,7 @@ def run( "note": "pairwise requires llm_serving loglikelihood", } self._save_stats(storage.file_name_prefix, stats) - return required + ["eval_score", "eval_valid", "eval_error"] + return required + [output_eval_score_key, output_eval_valid_key, output_eval_error_key] self._eval_pairwise( df=df, @@ -835,12 +838,12 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return required + ["eval_score", "eval_valid", "eval_error"] + return required + [output_eval_score_key, output_eval_valid_key, output_eval_error_key] else: self.logger.error(f"Unknown bench_dataflow_eval_type: {eval_type}") storage.write(df) - return ["eval_valid", "eval_error", "eval_pred", "eval_score"] + return [output_eval_valid_key, output_eval_error_key, input_pred_key, output_eval_score_key] # ----------------------------- # 默认 metric @@ -865,11 +868,11 @@ def _default_metric_for_type(self, t: str, use_semantic_judge: bool) -> str: # ----------------------------- def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: total = len(df) - valid_mask = df["eval_valid"] == True + valid_mask = df[output_eval_valid_key] == True valid = int(valid_mask.sum()) # eval_score: 0/1 if valid > 0: - acc = float(df.loc[valid_mask, "eval_score"].mean()) + acc = float(df.loc[valid_mask, output_eval_score_key].mean()) else: acc = 0.0 return { @@ -883,11 +886,11 @@ def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: # ----------------------------- def _stats_for_multiselect(self, df: pd.DataFrame) -> Dict[str, Any]: total = len(df) - valid_mask = df["eval_valid"] == True + valid_mask = df[output_eval_valid_key] == True valid = int(valid_mask.sum()) # eval_score 默认存 f1 if valid > 0: - f1_mean = float(df.loc[valid_mask, "eval_score"].mean()) + f1_mean = float(df.loc[valid_mask, output_eval_score_key].mean()) else: f1_mean = 0.0 # 如果你想要更多维度(jaccard/exact_set),可以从 eval_pred 里扩展存 dict,这里先给最小 @@ -913,8 +916,8 @@ def _eval_qa_single( # 语义 judge 需要 llm_serving.generate_from_input if self.llm_serving is None or not hasattr(self.llm_serving, "generate_from_input"): self.logger.error("semantic_judge requires llm_serving.generate_from_input") - df["eval_valid"] = False - df["eval_error"] = "semantic_judge_unavailable" + df[output_eval_valid_key] = False + df[output_eval_error_key] = "semantic_judge_unavailable" return # 默认用“预测 vs 标准”直接 judge(这里只做通用;可自行替换 AnswerJudgePrompt) @@ -924,12 +927,12 @@ def _eval_qa_single( gt = row[target_col] pred = row[pred_col] if gt is None or (isinstance(gt, str) and gt.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_reference" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_reference" continue if pred is None or (isinstance(pred, str) and pred.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_prediction" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_prediction" continue prompt = ( @@ -948,16 +951,16 @@ def _eval_qa_single( except Exception as e: self.logger.error(f"semantic_judge generate_from_input failed: {e}") for idx in row_indices: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = 
"semantic_judge_failed" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "semantic_judge_failed" return for idx, resp in zip(row_indices, responses): ok = self._resolve_judge_response(resp) - df.at[idx, "eval_score"] = 1.0 if ok else 0.0 - df.at[idx, "eval_pred"] = None - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_score_key] = 1.0 if ok else 0.0 + df.at[idx, output_eval_pred_key] = None + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" return @@ -966,22 +969,22 @@ def _eval_qa_single( gt = row[target_col] pred_raw = row[pred_col] if gt is None or (isinstance(gt, str) and gt.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_reference" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_reference" continue if pred_raw is None or (isinstance(pred_raw, str) and pred_raw.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_prediction" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_prediction" continue final_answer = self.answer_extractor.extract_answer(pred_raw, None) text_ok = self._text_contains_match(pred_raw, gt) or self._text_contains_match(final_answer, gt) math_res = self._try_math_verify_compare(final_answer, gt) ok = text_ok or (math_res is True) - df.at[idx, "eval_score"] = 1.0 if ok else 0.0 - df.at[idx, "eval_pred"] = str(final_answer) if (math_res is True) else str(pred_raw) - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_score_key] = 1.0 if ok else 0.0 + df.at[idx, output_eval_pred_key] = str(final_answer) if (math_res is True) else str(pred_raw) + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" # ----------------------------- # key2_q_ma:多参考 @@ -1002,12 +1005,12 @@ def _eval_qa_multi( targets = self._normalize_targets(targets_raw) if len(targets) == 0: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_references" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_references" continue if pred_raw is None or (isinstance(pred_raw, str) and pred_raw.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_prediction" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_prediction" continue final_answer = self.answer_extractor.extract_answer(pred_raw, None) @@ -1021,10 +1024,10 @@ def _eval_qa_multi( matched_by_text = matched_by_text or text_ok break - df.at[idx, "eval_score"] = 1.0 if ok_any else 0.0 - df.at[idx, "eval_pred"] = str(pred_raw) if matched_by_text else str(final_answer) - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_score_key] = 1.0 if ok_any else 0.0 + df.at[idx, output_eval_pred_key] = str(pred_raw) if matched_by_text else str(final_answer) + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" # ----------------------------- # key3_q_choices_a:单选 @@ -1048,20 +1051,20 @@ def _eval_mc_single( label = row[label_col] if choices is None or (isinstance(choices, float) and np.isnan(choices)): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_choices" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_choices" continue if not isinstance(choices, list): # 尝试 json try: choices = 
json.loads(str(choices)) except Exception: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "choices_not_list" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "choices_not_list" continue if len(choices) == 0: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_choices" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_choices" continue ctx = None @@ -1073,8 +1076,8 @@ def _eval_mc_single( # label 规范化为 idx gold_idx = self._normalize_label_to_index(label, len(choices)) if gold_idx is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "invalid_label" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "invalid_label" continue prompts = [prompt] * len(choices) @@ -1086,15 +1089,15 @@ def _eval_mc_single( lls = self._ll_batch(prompts, conts) if lls is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "ll_unavailable" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "ll_unavailable" continue pred_idx = int(np.argmax(np.array(lls))) - df.at[idx, "eval_pred"] = int(pred_idx) - df.at[idx, "eval_score"] = 1.0 if pred_idx == gold_idx else 0.0 - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_pred_key] = int(pred_idx) + df.at[idx, output_eval_score_key] = 1.0 if pred_idx == gold_idx else 0.0 + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" return @@ -1106,28 +1109,28 @@ def _eval_mc_single( pred_text = row[pred_col] if pred_col in df.columns else None if choices is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_choices" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_choices" continue if not isinstance(choices, list): try: choices = json.loads(str(choices)) except Exception: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "choices_not_list" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "choices_not_list" continue gold_idx = self._normalize_label_to_index(label, len(choices)) pred_idx = self._parse_choice_from_text(str(pred_text), len(choices)) if pred_text is not None else None if gold_idx is None or pred_idx is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "parse_failed" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "parse_failed" continue - df.at[idx, "eval_pred"] = int(pred_idx) - df.at[idx, "eval_score"] = 1.0 if pred_idx == gold_idx else 0.0 - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_pred_key] = int(pred_idx) + df.at[idx, output_eval_score_key] = 1.0 if pred_idx == gold_idx else 0.0 + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" def _normalize_label_to_index(self, label: Any, n: int) -> Optional[int]: if label is None: @@ -1175,15 +1178,15 @@ def _eval_mc_multi( pred_text = row[pred_col] if choices is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_choices" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_choices" continue if not isinstance(choices, list): try: choices = json.loads(str(choices)) except Exception: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "choices_not_list" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = 
"choices_not_list" continue n = len(choices) @@ -1191,20 +1194,20 @@ def _eval_mc_multi( pred_set = self._parse_multiselect_set(str(pred_text), n) if gold_set is None or pred_set is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "parse_failed" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "parse_failed" continue m = self._set_metrics(pred_set, gold_set) # eval_score 默认存 f1(你的上层聚合最常用) - df.at[idx, "eval_score"] = float(m["f1"]) + df.at[idx, output_eval_score_key] = float(m["f1"]) # eval_pred 存更丰富的信息,便于 debug - df.at[idx, "eval_pred"] = json.dumps( + df.at[idx, output_eval_pred_key] = json.dumps( {"pred_set": sorted(list(pred_set)), "gold_set": sorted(list(gold_set)), **m}, ensure_ascii=False, ) - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" def _normalize_multilabel_to_set(self, labels: Any, n: int) -> Optional[set]: if labels is None: @@ -1268,12 +1271,12 @@ def _eval_pairwise( rej = row[rejected_col] if better is None or (isinstance(better, str) and better.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_better" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_better" continue if rej is None or (isinstance(rej, str) and rej.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_rejected" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_rejected" continue ctx = None @@ -1291,15 +1294,15 @@ def _eval_pairwise( lls = self._ll_batch(prompts, conts) if lls is None or len(lls) != 2: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "ll_unavailable" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "ll_unavailable" continue win = 1.0 if float(lls[0]) > float(lls[1]) else 0.0 - df.at[idx, "eval_score"] = win - df.at[idx, "eval_pred"] = json.dumps({"ll_better": float(lls[0]), "ll_rejected": float(lls[1])}, ensure_ascii=False) - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_score_key] = win + df.at[idx, output_eval_pred_key] = json.dumps({"ll_better": float(lls[0]), "ll_rejected": float(lls[1])}, ensure_ascii=False) + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" # ----------------------------- # 语义 judge 响应解析(兼容你旧逻辑) @@ -1335,17 +1338,62 @@ def _resolve_judge_response(self, response: Any) -> bool: def get_desc(lang: str = "zh"): if lang == "zh": return ( - "统一 Bench 评测算子:支持 6 类纯文本评测范式。\n\n" - "支持类型:\n" - "- key1_text_score(默认 ppl)\n" - "- key2_qa(默认 math_verify / 可选 semantic_judge)\n" - "- key2_q_ma(默认 any_math_verify)\n" - "- key3_q_choices_a(默认 ll_choice_acc,若无 ll 接口则 fallback 解析生成)\n" - "- key3_q_choices_as(默认 micro_f1:解析多选集合后算 F1)\n" - "- key3_q_a_rejected(默认 pairwise_ll_winrate)\n\n" - "统一输出列:eval_score / eval_pred / eval_valid / eval_error,并支持统计落盘。" + "该算子用于统一 Bench 评测,支持多种任务范式并将评测结果写回 DataFrame,同时输出整体统计到 eval_result_path。\n\n" + "支持类型与默认 metric:\n" + "- key1_text_score:ppl\n" + "- key2_qa:math_verify(或 use_semantic_judge=True 时 semantic_judge)\n" + "- key2_q_ma:any_math_verify(多参考)\n" + "- key3_q_choices_a:ll_choice_acc(基于 loglikelihood;无 serving 接口时使用 HF forward 计算 ll)\n" + "- key3_q_choices_as:micro_f1(解析多选集合后计算)\n" + "- key3_q_a_rejected:pairwise_ll_winrate(基于 ll 比较 better vs rejected)\n\n" + "初始化参数:\n" + "- eval_result_path:统计结果落盘路径\n" + "- 
eval_type:评测类型(同上)\n" + "- llm_serving:可选;用于 semantic_judge 或提供模型路径信息以进行 PPL/LL 的 HF 计算\n" + "- prompt_template:提示模板对象(可选;需提供 build_prompt;默认使用 AnswerJudgePrompt)\n" + "- system_prompt:语义评测/judge 的系统提示词\n" + "- metric_type:可选;不传则使用 eval_type 的默认 metric\n" + "- use_semantic_judge:仅对 key2_qa 有效;是否使用语义评测\n\n" + "运行参数:\n" + "- storage:DataFlowStorage\n" + "- input_keys_map:字段映射(不同 eval_type 需要不同 key:text/question/target/targets/choices/label/labels/better/rejected)\n" + "- input_context_key:可选,上下文字段名\n" + "- input_pred_key:预测答案字段名(默认 generated_ans)\n\n" + "输出:\n" + "- output_eval_score_key(数值分数)\n" + "- output_eval_pred_key(解析后的预测)\n" + "- output_eval_valid_key(是否有效)\n" + "- output_eval_error_key(错误信息)\n" + "- 保存统计:total_samples/valid_samples/accuracy 或 ppl_mean 等到 eval_result_path\n" + "- 返回本次评测涉及/产出的列名列表" ) return ( - "Unified bench evaluator supporting 6 text-only task archetypes.\n" - "Outputs: eval_score / eval_pred / eval_valid / eval_error with stats saved." - ) + "This operator evaluates unified bench datasets across multiple task archetypes. It writes per-sample results back to the dataframe and saves aggregated statistics to eval_result_path.\n\n" + "Supported Types (default metric):\n" + "- key1_text_score (ppl)\n" + "- key2_qa (math_verify or semantic_judge)\n" + "- key2_q_ma (any_math_verify)\n" + "- key3_q_choices_a (ll_choice_acc)\n" + "- key3_q_choices_as (micro_f1)\n" + "- key3_q_a_rejected (pairwise_ll_winrate)\n\n" + "Input Parameters:\n" + "- eval_result_path: Path to save aggregated statistics\n" + "- eval_type: Evaluation type (one of the supported types)\n" + "- llm_serving: Optional; required for semantic_judge and used as model source for HF-based PPL/LL computation\n" + "- prompt_template: Prompt template object (optional; must provide build_prompt; default is AnswerJudgePrompt)\n" + "- system_prompt: System prompt for semantic judging\n" + "- metric_type: Optional; overrides the default metric for the given eval_type\n" + "- use_semantic_judge: Only for key2_qa; whether to use LLM-based semantic judging\n\n" + "Run Parameters:\n" + "- storage: DataFlowStorage\n" + "- keys_map: Column mapping; depends on eval_type (text/question/target/targets/choices/label/labels/better/rejected)\n" + "- context_key: Optional context column name\n" + "- input_pred_key: Prediction column name (default: generated_ans)\n\n" + "Output Parameters:\n" + f"- output_eval_score_key: Numeric score (accuracy classes use 0/1)\n" + f"- output_eval_pred_key: Parsed prediction\n" + f"- output_eval_valid_key: Whether the sample is valid\n" + f"- output_eval_error_key: Error message if any\n" + "- Saves aggregated stats to eval_result_path\n" + "- Returns a list of involved/output keys" + ) \ No newline at end of file diff --git a/dataflow/operators/core_text/generate/bench_answer_generator.py b/dataflow/operators/core_text/generate/bench_answer_generator.py index bcc619fe..796c2033 100644 --- a/dataflow/operators/core_text/generate/bench_answer_generator.py +++ b/dataflow/operators/core_text/generate/bench_answer_generator.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import json import inspect import re @@ -41,11 +39,10 @@ def __init__( "key3_q_choices_as", "key3_q_a_rejected", ] = "key2_qa", - llm_serving: Optional[LLMServingABC] = None, - prompt_template: Optional[Union[DIYPromptABC, Any]] = None, + llm_serving: LLMServingABC = None, + prompt_template: DIYPromptABC = None, system_prompt: str = "You are a helpful assistant specialized in generating answers to questions.", 
allow_overwrite: bool = False, - # 是否强制对所有类型都生成, 默认只对需要 pred 的类型生成 force_generate: bool = False, ): self.logger = get_logger() @@ -189,8 +186,8 @@ def _need_generation(self, eval_type: str) -> bool: def run( self, storage: DataFlowStorage, - keys_map: Dict[str, str], - context_key: Optional[str] = None, + input_keys_map: Dict[str, str], + input_context_key: Optional[str] = None, output_key: str = "generated_ans", ) -> List[str]: @@ -208,13 +205,13 @@ def run( return [] # 读取字段 - q_col = keys_map.get("question") + q_col = input_keys_map.get("question") if not q_col or q_col not in df.columns: self.logger.error(f"缺少 question 列, keys_map.question={q_col}") storage.write(df) return [] - ch_col = keys_map.get("choices") + ch_col = input_keys_map.get("choices") need_choices = eval_type in ("key3_q_choices_a", "key3_q_choices_as") if need_choices and (not ch_col or ch_col not in df.columns): self.logger.error(f"缺少 choices 列, keys_map.choices={ch_col}") @@ -222,11 +219,11 @@ def run( return [] ctx_series = None - if context_key: - if context_key in df.columns: - ctx_series = df[context_key] + if input_context_key: + if input_context_key in df.columns: + ctx_series = df[input_context_key] else: - self.logger.error(f"context_key 不存在: {context_key}, 视为 None") + self.logger.error(f"context_key 不存在: {input_context_key}, 视为 None") prompts: List[str] = [] for idx, row in df.iterrows(): @@ -259,8 +256,39 @@ def run( def get_desc(lang: str = "zh"): if lang == "zh": return ( - "用于 bench 评测的统一生成算子, 与 evaluator 的 eval_type + keys_map 对齐。\n" - "默认只对需要生成输出的类型生成 output_key=generated_ans, 并支持 context_key 作为可选上下文。\n" - "可通过 allow_overwrite 控制是否覆盖已存在的输出列。" + "该算子用于 bench 评测的统一答案生成,根据 eval_type + keys_map 从 DataFrame 取字段构造 prompt 并批量调用 LLM 生成答案。\n" + "对于默认不需要生成的类型会跳过生成(可用 force_generate 强制)。\n\n" + "初始化参数:\n" + "- eval_type:评测类型(key1_text_score / key2_qa / key2_q_ma / key3_q_choices_a / key3_q_choices_as / key3_q_a_rejected)\n" + "- llm_serving:LLM 服务对象(需提供 generate_from_input)\n" + "- prompt_template:提示模板对象(可选,需提供 build_prompt;否则使用内置 fallback 模板)\n" + "- system_prompt:系统提示词\n" + "- allow_overwrite:输出列已存在时是否允许覆盖\n" + "- force_generate:是否强制对可生成类型都生成\n\n" + "运行参数:\n" + "- storage:DataFlowStorage\n" + "- input_keys_map:字段映射,至少包含 question;选择题需包含 choices\n" + "- input_context_key:可选,上下文字段名\n" + "- output_key:生成结果列名(默认 generated_ans)\n\n" + "输出:\n" + "- 写回 DataFrame 的 output_key 列(若跳过生成则不写)\n" + "- 返回新增/写入的列名列表(通常为 [output_key] 或 [])" ) - return "Unified bench answer generator aligned with evaluator eval_type and keys_map." 
+ return ( + "This operator generates answers for unified bench evaluation by building prompts from a dataframe and calling an LLM.\n\n" + "Input Parameters:\n" + "- eval_type: Evaluation type (key1_text_score/key2_qa/key2_q_ma/key3_q_choices_a/key3_q_choices_as/key3_q_a_rejected)\n" + "- llm_serving: LLM serving object (must provide generate_from_input)\n" + "- prompt_template: Prompt template object (optional; must provide build_prompt; falls back to an internal template)\n" + "- system_prompt: System prompt passed to the serving (if supported)\n" + "- allow_overwrite: Whether to overwrite an existing output column\n" + "- force_generate: Whether to force generation for types that can be skipped by default\n\n" + "Run Parameters:\n" + "- storage: DataFlowStorage\n" + "- keys_map: Column mapping (requires question; for choice tasks requires choices)\n" + "- context_key: Optional context column name\n" + "- output_key: Output column name for generated answers (default: generated_ans)\n\n" + "Output Parameters:\n" + "- Writes output_key into the dataframe when generation is performed\n" + "- Returns a list of written keys (usually [output_key] or [])" + ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py index f18de41b..13941c02 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py @@ -53,15 +53,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py index 14271498..55f580df 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py @@ -64,15 +64,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py index e69fca06..6992e080 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py @@ -67,15 +67,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - 
context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py index d522f265..c84ccb4a 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py @@ -66,15 +66,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py index 90ca3f05..461c65b9 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py @@ -66,15 +66,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py index 18aad7d1..a5ff886c 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py @@ -66,15 +66,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) From 83abcbbbbb39092795c6f5f11ca2d759b60ce498 Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Thu, 8 Jan 2026 22:36:45 +0800 Subject: [PATCH 6/6] using formatstrprompt op instead of coding string prompt in pipeline.py --- .../eval/unified_bench_dataset_evaluator.py | 116 +++++++++++++----- .../generate/bench_answer_generator.py | 74 ++++++++--- .../unified_bench_eval_type1.py | 11 +- .../unified_bench_eval_type2.py | 29 ++--- .../unified_bench_eval_type3.py | 34 +++-- .../unified_bench_eval_type4.py | 35 +++--- .../unified_bench_eval_type5.py | 35 +++--- .../unified_bench_eval_type6.py | 34 +++-- 
.../unified_bench_eval_type_semantic.py | 83 +++++++++++++ 9 files changed, 299 insertions(+), 152 deletions(-) create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type_semantic.py diff --git a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py index 1958a4a0..c90d721c 100644 --- a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py +++ b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py @@ -41,7 +41,7 @@ class UnifiedBenchDatasetEvaluator(OperatorABC): - key3_q_a_rejected 核心思想: - 只需要传 bench_dataflow_eval_type + metric_type + keys_map + (可选) context_key + 只需要传 bench_dataflow_eval_type + metric_type + input_xxx_key + (可选) context_key - evaluator 内部负责: 1) 读取 dataframe 2) 取 keys @@ -592,7 +592,15 @@ def _save_stats(self, bench_name_or_prefix: str, stats: Dict[str, Any]) -> None: def run( self, storage: DataFlowStorage, - input_keys_map: Optional[Dict[str, str]] = None, + input_text_key: Optional[str] = None, + input_question_key: Optional[str] = None, + input_target_key: Optional[str] = None, + input_targets_key: Optional[str] = None, + input_choices_key: Optional[str] = None, + input_label_key: Optional[str] = None, + input_labels_key: Optional[str] = None, + input_better_key: Optional[str] = None, + input_rejected_key: Optional[str] = None, input_context_key: Optional[str] = None, input_pred_key: str = "generated_ans", output_eval_valid_key: str = "eval_valid", @@ -601,17 +609,22 @@ def run( output_eval_score_key: str = "eval_score", ) -> List[str]: """ - keys_map 示例: - - key1_text_score: {"text": "text"} - - key2_qa: {"question":"question", "target":"golden_answer"} - - key2_q_ma: {"question":"question", "targets":"gold_answers"} - - key3_q_choices_a: {"question":"question", "choices":"choices", "label":"label"} - - key3_q_choices_as: {"question":"question", "choices":"choices", "labels":"labels"} - - key3_q_a_rejected: {"question":"question", "better":"chosen", "rejected":"rejected"} + 字段列名通过 input_xxx_key 显式传入(未传默认 None): + - key1_text_score: input_text_key + - key2_qa: input_question_key + input_target_key + - key2_q_ma: input_question_key + input_targets_key + - key3_q_choices_a: input_question_key + input_choices_key + input_label_key + - key3_q_choices_as: input_question_key + input_choices_key + input_labels_key + - key3_q_a_rejected: input_question_key + input_better_key + input_rejected_key """ df = storage.read("dataframe") eval_type = self.eval_type + self.output_eval_valid_key = output_eval_valid_key + self.output_eval_error_key = output_eval_error_key + self.output_eval_pred_key = output_eval_pred_key + self.output_eval_score_key = output_eval_score_key + # 输出列统一 if output_eval_valid_key not in df.columns: df[output_eval_valid_key] = True @@ -624,10 +637,6 @@ def run( if metric_type is None: metric_type = self._default_metric_for_type(eval_type, self.use_semantic_judge) - if input_keys_map is None: - self.logger.error("keys_map is required.") - storage.write(df) - return [output_eval_valid_key, output_eval_error_key, output_eval_pred_key, output_eval_score_key] # context 处理:统一读一列(可无) ctx_series = None @@ -639,12 +648,12 @@ def run( # 分发 if eval_type == "key1_text_score": - required = [input_keys_map.get("text", "")] + text_col = input_text_key or "" + required = [text_col] if not self._check_columns(df, required): storage.write(df) return required - text_col = input_keys_map["text"] texts = [str(x) if x is not 
None else "" for x in df[text_col].tolist()] ppl = self._ppl_batch(texts) if ppl is None: @@ -681,9 +690,9 @@ def run( # QA:默认走 math_verify 抽取+对比(可选 semantic_judge) # 单参考:target # 多参考:targets - question_col = input_keys_map.get("question", "") + question_col = input_question_key or "" if eval_type == "key2_qa": - target_col = input_keys_map.get("target", "") + target_col = input_target_key or "" required = [question_col, target_col, input_pred_key] if not self._check_columns(df, required): storage.write(df) @@ -709,7 +718,7 @@ def run( return [question_col, target_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] else: - targets_col = input_keys_map.get("targets", "") + targets_col = input_targets_key or "" required = [question_col, targets_col, input_pred_key] if not self._check_columns(df, required): storage.write(df) @@ -735,9 +744,9 @@ def run( return [question_col, targets_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_choices_a": - question_col = input_keys_map.get("question", "") - choices_col = input_keys_map.get("choices", "") - label_col = input_keys_map.get("label", "") + question_col = input_question_key or "" + choices_col = input_choices_key or "" + label_col = input_label_key or "" required = [question_col, choices_col, label_col] # 若没有 llm_serving,则 fallback 需要 pred_col if self.llm_serving is None: @@ -768,9 +777,9 @@ def run( return [question_col, choices_col, label_col, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_choices_as": - question_col = input_keys_map.get("question", "") - choices_col = input_keys_map.get("choices", "") - labels_col = input_keys_map.get("labels", "") + question_col = input_question_key or "" + choices_col = input_choices_key or "" + labels_col = input_labels_key or "" required = [question_col, choices_col, labels_col, input_pred_key] # 先按“解析模型输出集合”实现 if not self._check_columns(df, required): storage.write(df) @@ -796,9 +805,9 @@ def run( return [question_col, choices_col, labels_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_a_rejected": - question_col = input_keys_map.get("question", "") - better_col = input_keys_map.get("better", "") - rejected_col = input_keys_map.get("rejected", "") + question_col = input_question_key or "" + better_col = input_better_key or "" + rejected_col = input_rejected_key or "" required = [question_col, better_col, rejected_col] if not self._check_columns(df, required): storage.write(df) @@ -867,6 +876,9 @@ def _default_metric_for_type(self, t: str, use_semantic_judge: bool) -> str: # 统计:binary(0/1) # ----------------------------- def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: + output_eval_valid_key = self.output_eval_valid_key + output_eval_score_key = self.output_eval_score_key + total = len(df) valid_mask = df[output_eval_valid_key] == True valid = int(valid_mask.sum()) @@ -885,6 +897,9 @@ def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: # 统计:多选(f1/jaccard 等) # ----------------------------- def _stats_for_multiselect(self, df: pd.DataFrame) -> Dict[str, Any]: + output_eval_valid_key = self.output_eval_valid_key + output_eval_score_key = self.output_eval_score_key + total = len(df) valid_mask = df[output_eval_valid_key] == True valid = int(valid_mask.sum()) @@ -912,6 +927,11 @@ def _eval_qa_single( ctx_series: Optional[pd.Series], metric_type: str, ) -> None: 
+ output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + if metric_type == "semantic_judge": # 语义 judge 需要 llm_serving.generate_from_input if self.llm_serving is None or not hasattr(self.llm_serving, "generate_from_input"): @@ -998,6 +1018,11 @@ def _eval_qa_multi( ctx_series: Optional[pd.Series], metric_type: str, ) -> None: + output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + # 默认:any_math_verify for idx, row in df.iterrows(): targets_raw = row[targets_col] @@ -1042,6 +1067,11 @@ def _eval_mc_single( metric_type: str, pred_col: str, ) -> None: + output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + # 优先:loglikelihood if metric_type == "ll_choice_acc" and self.llm_serving is not None: # 批量做:每行要对 choices 逐个算 ll,先实现清晰版(你后面可优化 batching) @@ -1171,6 +1201,11 @@ def _eval_mc_multi( pred_col: str, metric_type: str, ) -> None: + output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + # 这里按你说的“先最小落地”:从 pred_col 解析集合 -> micro_f1 for idx, row in df.iterrows(): choices = row[choices_col] @@ -1264,6 +1299,11 @@ def _eval_pairwise( ctx_series: Optional[pd.Series], metric_type: str, ) -> None: + output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + # 默认:pairwise_ll_winrate for idx, row in df.iterrows(): q = row[question_col] @@ -1356,7 +1396,15 @@ def get_desc(lang: str = "zh"): "- use_semantic_judge:仅对 key2_qa 有效;是否使用语义评测\n\n" "运行参数:\n" "- storage:DataFlowStorage\n" - "- input_keys_map:字段映射(不同 eval_type 需要不同 key:text/question/target/targets/choices/label/labels/better/rejected)\n" + "- input_text_key:文本列名(key1_text_score)\n" + "- input_question_key:问题列名(key2/key3)\n" + "- input_target_key:单个参考答案列名(key2_qa)\n" + "- input_targets_key:多个参考答案列名(key2_q_ma)\n" + "- input_choices_key:选项列名(key3_q_choices_a/key3_q_choices_as)\n" + "- input_label_key:单个标签列名(key3_q_choices_a)\n" + "- input_labels_key:多个标签列名(key3_q_choices_as)\n" + "- input_better_key:优选答案列名(key3_q_a_rejected)\n" + "- input_rejected_key:劣选答案列名(key3_q_a_rejected)\n" "- input_context_key:可选,上下文字段名\n" "- input_pred_key:预测答案字段名(默认 generated_ans)\n\n" "输出:\n" @@ -1386,8 +1434,16 @@ def get_desc(lang: str = "zh"): "- use_semantic_judge: Only for key2_qa; whether to use LLM-based semantic judging\n\n" "Run Parameters:\n" "- storage: DataFlowStorage\n" - "- keys_map: Column mapping; depends on eval_type (text/question/target/targets/choices/label/labels/better/rejected)\n" - "- context_key: Optional context column name\n" + "- input_text_key: Text column name (key1_text_score)\n" + "- input_question_key: Question column name (key2/key3)\n" + "- input_target_key: Single reference answer column name (key2_qa)\n" + "- input_targets_key: Multiple reference answers column name (key2_q_ma)\n" + "- input_choices_key: Choices column name (key3_q_choices_a/key3_q_choices_as)\n" + "- input_label_key: 
Single label column name (key3_q_choices_a)\n" + "- input_labels_key: Multiple labels column name (key3_q_choices_as)\n" + "- input_better_key: Better answer column name (key3_q_a_rejected)\n" + "- input_rejected_key: Rejected answer column name (key3_q_a_rejected)\n" + "- input_context_key: Optional context column name\n" "- input_pred_key: Prediction column name (default: generated_ans)\n\n" "Output Parameters:\n" f"- output_eval_score_key: Numeric score (accuracy classes use 0/1)\n" diff --git a/dataflow/operators/core_text/generate/bench_answer_generator.py b/dataflow/operators/core_text/generate/bench_answer_generator.py index 796c2033..501c554a 100644 --- a/dataflow/operators/core_text/generate/bench_answer_generator.py +++ b/dataflow/operators/core_text/generate/bench_answer_generator.py @@ -1,6 +1,5 @@ import json import inspect -import re from typing import Any, Dict, List, Literal, Optional, Union import numpy as np @@ -9,11 +8,14 @@ from dataflow import get_logger from dataflow.core import OperatorABC, LLMServingABC from dataflow.core.prompt import DIYPromptABC, prompt_restrict +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.utils.storage import DataFlowStorage -@prompt_restrict() # 保持通用, 不强绑固定 prompt 类 +@prompt_restrict( + FormatStrPrompt + ) @OPERATOR_REGISTRY.register() class BenchAnswerGenerator(OperatorABC): @@ -22,8 +24,8 @@ class BenchAnswerGenerator(OperatorABC): 输入: - eval_type: 评测类型, 取值同 evaluator - - keys_map: 指定各字段名, 同 evaluator - - context_key: 可选, 上下文字段名, 不传则 None + - 运行时通过 input_xxx_key 传入各字段名(未传默认 None) + - input_context_key: 可选, 上下文字段名, 不传则 None 输出: - output_key: 生成结果列, 默认 generated_ans - 对于不需要生成的类型, 默认不写 output_key, 直接返回空列表 @@ -40,7 +42,7 @@ def __init__( "key3_q_a_rejected", ] = "key2_qa", llm_serving: LLMServingABC = None, - prompt_template: DIYPromptABC = None, + prompt_template: Union[FormatStrPrompt, DIYPromptABC] = FormatStrPrompt, system_prompt: str = "You are a helpful assistant specialized in generating answers to questions.", allow_overwrite: bool = False, force_generate: bool = False, @@ -135,23 +137,35 @@ def _build_prompt( if self.prompt_template is not None and hasattr(self.prompt_template, "build_prompt"): try: fn = getattr(self.prompt_template, "build_prompt") + + if eval_type in ("key3_q_choices_a", "key3_q_choices_as"): + need_fields = {"question", "choices"} + else: + need_fields = {"question"} + kwargs = { "eval_type": eval_type, "question": question, - "context": context, + "context": context or "", "choices": choices, - "choices_text": self._format_choices_text(choices) if choices else None, + "choices_text": self._format_choices_text(choices) if choices else "", } - kwargs = {k: v for k, v in kwargs.items() if v is not None} sig = inspect.signature(fn) - params = sig.parameters.values() + params = list(sig.parameters.values()) has_varkw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params) + + accepted = {p.name for p in params if p.name != "self"} + expects_need_fields = "need_fields" in accepted + if has_varkw: + if expects_need_fields: + return fn(need_fields, **kwargs) return fn(**kwargs) - accepted = {p.name for p in params if p.name != "self"} filtered = {k: v for k, v in kwargs.items() if k in accepted} + if expects_need_fields: + return fn(need_fields, **filtered) return fn(**filtered) except Exception as e: self.logger.error(f"prompt_template.build_prompt 失败, fallback 默认模板: {e}") @@ -186,7 +200,15 @@ def _need_generation(self, eval_type: str) -> 
bool: def run( self, storage: DataFlowStorage, - input_keys_map: Dict[str, str], + input_text_key: Optional[str] = None, + input_question_key: Optional[str] = None, + input_target_key: Optional[str] = None, + input_targets_key: Optional[str] = None, + input_choices_key: Optional[str] = None, + input_label_key: Optional[str] = None, + input_labels_key: Optional[str] = None, + input_better_key: Optional[str] = None, + input_rejected_key: Optional[str] = None, input_context_key: Optional[str] = None, output_key: str = "generated_ans", ) -> List[str]: @@ -205,16 +227,16 @@ def run( return [] # 读取字段 - q_col = input_keys_map.get("question") + q_col = input_question_key if not q_col or q_col not in df.columns: - self.logger.error(f"缺少 question 列, keys_map.question={q_col}") + self.logger.error(f"缺少 question 列, input_question_key={q_col}") storage.write(df) return [] - ch_col = input_keys_map.get("choices") + ch_col = input_choices_key need_choices = eval_type in ("key3_q_choices_a", "key3_q_choices_as") if need_choices and (not ch_col or ch_col not in df.columns): - self.logger.error(f"缺少 choices 列, keys_map.choices={ch_col}") + self.logger.error(f"缺少 choices 列, input_choices_key={ch_col}") storage.write(df) return [] @@ -267,7 +289,15 @@ def get_desc(lang: str = "zh"): "- force_generate:是否强制对可生成类型都生成\n\n" "运行参数:\n" "- storage:DataFlowStorage\n" - "- input_keys_map:字段映射,至少包含 question;选择题需包含 choices\n" + "- input_text_key:文本列名(key1_text_score)\n" + "- input_question_key:问题列名(key2/key3)\n" + "- input_target_key:单个参考答案列名(key2_qa)\n" + "- input_targets_key:多个参考答案列名(key2_q_ma)\n" + "- input_choices_key:选项列名(key3_q_choices_a/key3_q_choices_as)\n" + "- input_label_key:单个标签列名(key3_q_choices_a)\n" + "- input_labels_key:多个标签列名(key3_q_choices_as)\n" + "- input_better_key:优选答案列名(key3_q_a_rejected)\n" + "- input_rejected_key:劣选答案列名(key3_q_a_rejected)\n" "- input_context_key:可选,上下文字段名\n" "- output_key:生成结果列名(默认 generated_ans)\n\n" "输出:\n" @@ -285,8 +315,16 @@ def get_desc(lang: str = "zh"): "- force_generate: Whether to force generation for types that can be skipped by default\n\n" "Run Parameters:\n" "- storage: DataFlowStorage\n" - "- keys_map: Column mapping (requires question; for choice tasks requires choices)\n" - "- context_key: Optional context column name\n" + "- input_text_key: Text column name (key1_text_score)\n" + "- input_question_key: Question column name (key2/key3)\n" + "- input_target_key: Single reference answer column name (key2_qa)\n" + "- input_targets_key: Multiple reference answers column name (key2_q_ma)\n" + "- input_choices_key: Choices column name (key3_q_choices_a/key3_q_choices_as)\n" + "- input_label_key: Single label column name (key3_q_choices_a)\n" + "- input_labels_key: Multiple labels column name (key3_q_choices_as)\n" + "- input_better_key: Better answer column name (key3_q_a_rejected)\n" + "- input_rejected_key: Rejected answer column name (key3_q_a_rejected)\n" + "- input_context_key: Optional context column name\n" "- output_key: Output column name for generated answers (default: generated_ans)\n\n" "Output Parameters:\n" "- Writes output_key into the dataframe when generation is performed\n" diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py index 13941c02..37aa98f5 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py +++ 
b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py @@ -13,22 +13,20 @@ "key3_q_a_rejected", """ -DIY_PROMPT_ANSWER = """Please output the answer.""" EVAL_TYPE = "key1_text_score" -KEY_MAPS = {"text": "text"} class UnifiedBenchEvalPipeline(): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type1.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) @@ -53,17 +51,16 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, + input_text_key="text", input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, + input_text_key="text", input_context_key=None, input_pred_key="generated_ans", - ) if __name__ == "__main__": diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py index 55f580df..605df4e0 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py @@ -1,4 +1,5 @@ from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.core.prompt import DIYPromptABC from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm @@ -15,39 +16,31 @@ """ EVAL_TYPE = "key2_qa" -KEY_MAPS = { - "question": "question", - "target": "golden_label" -} - -class AnswerGeneratePromptDIY(DIYPromptABC): - def build_prompt(self, question:str = None): - prompt = f""" - Question: {question} - Answer: - """ - return prompt class UnifiedBenchEvalPipeline(): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type2.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="Question: {question}\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=AnswerGeneratePromptDIY(), + prompt_template=self.generation_prompt_template, 
allow_overwrite=False, force_generate=False, ) @@ -64,14 +57,16 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, + input_question_key="question", + input_target_key="golden_label", input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, + input_question_key="question", + input_target_key="golden_label", input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py index 6992e080..9be6f441 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py @@ -1,5 +1,6 @@ from dataflow.pipeline.Pipeline import PipelineABC from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.core.prompt import DIYPromptABC from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm @@ -16,41 +17,32 @@ """ EVAL_TYPE = "key2_q_ma" -KEY_MAPS = { - "context": "context", # optional - "question": "question", - "targets": "targets" -} - -class AnswerGeneratePromptDIY(DIYPromptABC): - def build_prompt(self, question:str = None): - prompt = f""" - Question: {question} - Answer: - """ - return prompt class UnifiedBenchEvalPipeline(PipelineABC): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): super().__init__() self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type3.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="Question: {question}\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=AnswerGeneratePromptDIY(), + prompt_template=self.generation_prompt_template, allow_overwrite=False, force_generate=False, ) @@ -67,15 +59,17 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_targets_key="targets", output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_targets_key="targets", input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py 
b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py index c84ccb4a..3716cc57 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py @@ -1,6 +1,6 @@ from dataflow.pipeline.Pipeline import PipelineABC from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator -from dataflow.core.prompt import DIYPromptABC +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm from dataflow.core import LLMServingABC @@ -16,40 +16,32 @@ """ EVAL_TYPE = "key3_q_choices_a" -KEY_MAPS = { - "context": "context", # optional - "question": "question", - "choices": "choices", - "label": "label" -} - -class MMLUPromptDIY(DIYPromptABC): - def build_prompt(self, question: str = None, choices_text: str = None, context: str = None, **kwargs): - ctx = f"Context:\n{context}\n\n" if context else "" - return f"{ctx}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:" - class UnifiedBenchEvalPipeline(PipelineABC): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): super().__init__() self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type4.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="{context}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=MMLUPromptDIY(), + prompt_template=self.generation_prompt_template, allow_overwrite=False, force_generate=False, ) @@ -66,15 +58,18 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_choices_key="choices", output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_choices_key="choices", + input_label_key="label", input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py index 461c65b9..56705cae 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py @@ -1,6 +1,6 @@ from dataflow.pipeline.Pipeline import PipelineABC from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator -from dataflow.core.prompt import 
DIYPromptABC +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm from dataflow.core import LLMServingABC @@ -16,40 +16,32 @@ """ EVAL_TYPE = "key3_q_choices_as" -KEY_MAPS = { - "context": "context", # optional - "question": "question", - "choices": "choices", - "labels": "labels" -} - -class MMLUPromptDIY(DIYPromptABC): - def build_prompt(self, question: str = None, choices_text: str = None, context: str = None, **kwargs): - ctx = f"Context:\n{context}\n\n" if context else "" - return f"{ctx}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:" - class UnifiedBenchEvalPipeline(PipelineABC): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): super().__init__() self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type5.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="{context}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=MMLUPromptDIY(), + prompt_template=self.generation_prompt_template, allow_overwrite=False, force_generate=False, ) @@ -66,15 +58,18 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_choices_key="choices", output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_choices_key="choices", + input_labels_key="labels", input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py index a5ff886c..041c6b14 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py @@ -1,6 +1,6 @@ from dataflow.pipeline.Pipeline import PipelineABC from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator -from dataflow.core.prompt import DIYPromptABC +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm from dataflow.core import LLMServingABC @@ -16,40 +16,32 @@ """ EVAL_TYPE = "key3_q_a_rejected" -KEY_MAPS = { - "context": "context", # optional - "question": "question", - "better": "better", - "rejected": "rejected" -} - -class PreferencePairwisePromptDIY(DIYPromptABC): - def build_prompt(self, question: str = None, 
context: str = None, **kwargs): - ctx = f"Context:\n{context}\n\n" if context else "" - return f"{ctx}Question:\n{question}\n\nAnswer:" - class UnifiedBenchEvalPipeline(PipelineABC): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): super().__init__() self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type6.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="{context}Question:\n{question}\n\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=PreferencePairwisePromptDIY(), + prompt_template=self.generation_prompt_template, allow_overwrite=False, force_generate=False, ) @@ -66,15 +58,17 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_better_key="better", + input_rejected_key="rejected", input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type_semantic.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type_semantic.py new file mode 100644 index 00000000..fae6fb81 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type_semantic.py @@ -0,0 +1,83 @@ +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.prompts.core_text import FormatStrPrompt +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm, APILLMServing_request +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key2_qa" + +class UnifiedBenchEvalPipeline(): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + + self.storage = FileStorage( + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type2.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + # use API server as LLM serving + self.llm_serving_judger = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o", + 
max_workers=5 + ) + + self.generation_prompt_template = FormatStrPrompt( + f_str_template="Question: {question}\nAnswer:", + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=self.generation_prompt_template, + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_judger, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=True, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + input_question_key="question", + input_target_key="golden_label", + input_context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + input_question_key="question", + input_target_key="golden_label", + input_context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.forward()
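
Usage sketch (reviewer note, not part of the patch): a minimal end-to-end key2_qa run wired with the per-column arguments introduced in PATCH 6/6 instead of KEY_MAPS. It only uses operators and parameters that appear in this series (BenchAnswerGenerator, UnifiedBenchDatasetEvaluator, FormatStrPrompt, FileStorage, LocalModelLLMServing_vllm); the dataset path, model name, and the question/golden_label column names are placeholders and would need to match your own data.

# Minimal sketch of the post-patch key2_qa flow; paths, model name and column
# names are illustrative placeholders, not values defined by the patch.
from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator
from dataflow.prompts.core_text import FormatStrPrompt
from dataflow.utils.storage import FileStorage
from dataflow.serving import LocalModelLLMServing_vllm

storage = FileStorage(
    first_entry_file_name="./my_bench_key2_qa.jsonl",   # placeholder dataset
    cache_path="./cache_local",
    file_name_prefix="dataflow_cache_step",
    cache_type="jsonl",
)
serving = LocalModelLLMServing_vllm(
    hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct",   # or your own model path
    vllm_tensor_parallel_size=1,
    vllm_max_tokens=2048,
)

generator = BenchAnswerGenerator(
    llm_serving=serving,
    eval_type="key2_qa",
    prompt_template=FormatStrPrompt(f_str_template="Question: {question}\nAnswer:"),
)
evaluator = UnifiedBenchDatasetEvaluator(
    eval_result_path="./cache_local/eval_result/eval_result.jsonl",
    eval_type="key2_qa",
    metric_type=None,   # falls back to the default metric for key2_qa (math_verify)
)

# Column names are now passed explicitly per field instead of via a keys_map dict.
generator.run(
    storage=storage.step(),
    input_question_key="question",
    input_target_key="golden_label",
    output_key="generated_ans",
)
evaluator.run(
    storage=storage.step(),
    input_question_key="question",
    input_target_key="golden_label",
    input_pred_key="generated_ans",
)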