From d3c7b251b40dd57c61fb8c798b0af881d875faee Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 5 Jan 2026 20:07:55 +0800 Subject: [PATCH 1/6] first version of new eval --- .../unified_bench_eval_type1.jsonl | 6 + dataflow/operators/core_text/__init__.py | 1 + .../eval/unified_bench_dataset_evaluator.py | 1215 +++++++++++++++++ .../generate/bench_answer_generator.py | 250 ++++ .../unified_bench_eval_pipeline.py | 66 + 5 files changed, 1538 insertions(+) create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type1.jsonl create mode 100644 dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py create mode 100644 dataflow/operators/core_text/generate/bench_answer_generator.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py diff --git a/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl new file mode 100644 index 00000000..a76d9002 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl @@ -0,0 +1,6 @@ +{"text": "This is a simple test sentence to measure perplexity for the first unified bench type."} +{"text": "The capital of France is Paris."} +{"text": "Please evaluate the language model perplexity on this short example."} +{"text": "Machine learning enables computers to learn patterns from data."} +{"text": "Perplexity is a common metric for evaluating language models on text scoring tasks."} +{"text": "666233gigity"} \ No newline at end of file diff --git a/dataflow/operators/core_text/__init__.py b/dataflow/operators/core_text/__init__.py index 933fb105..2ed5a57e 100644 --- a/dataflow/operators/core_text/__init__.py +++ b/dataflow/operators/core_text/__init__.py @@ -8,6 +8,7 @@ from .generate.text2multihopqa_generator import Text2MultiHopQAGenerator from .generate.embedding_generator import EmbeddingGenerator from .generate.retrieval_generator import RetrievalGenerator + from .generate.bench_answer_generator import BenchAnswerGenerator from .eval.bench_dataset_evaluator import BenchDatasetEvaluator from .eval.bench_dataset_evaluator_question import BenchDatasetEvaluatorQuestion from .eval.text2qa_sample_evaluator import Text2QASampleEvaluator diff --git a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py new file mode 100644 index 00000000..6a09990b --- /dev/null +++ b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py @@ -0,0 +1,1215 @@ +from __future__ import annotations + +import json +import os +import re +import time +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union +import numpy as np +import pandas as pd +from math_verify import parse, verify +import torch +import torch.nn.functional as F +from transformers import AutoModelForCausalLM, AutoTokenizer + +from dataflow import get_logger +from dataflow.core import OperatorABC +from dataflow.core.prompt import DIYPromptABC +from dataflow.core.prompt import prompt_restrict +from dataflow.prompts.model_evaluation.general import AnswerJudgePrompt +from dataflow.core import LLMServingABC +from dataflow.utils.registry import OPERATOR_REGISTRY +from dataflow.utils.storage import DataFlowStorage +from dataflow.utils.reasoning.AnswerExtraction import StringCleaner, UnitTextManager, AnswerExtractor + + +@prompt_restrict( + AnswerJudgePrompt +) + +@OPERATOR_REGISTRY.register() +class 
UnifiedBenchDatasetEvaluator(OperatorABC):
+    """
+    Unified bench evaluation operator: supports 6 keys-type + metric combinations.
+
+    Evaluation types (bench_dataflow_eval_type), see the docs for details:
+      - key1_text_score
+      - key2_qa
+      - key2_q_ma
+      - key3_q_choices_a
+      - key3_q_choices_as
+      - key3_q_a_rejected
+
+    Core idea:
+      only bench_dataflow_eval_type + metric_type + keys_map + (optional) context_key are passed in
+      - the evaluator internally takes care of:
+        1) reading the dataframe
+        2) picking the keys
+        3) assembling the prompt (via prompt_template or the default template)
+        4) computing the metric
+        5) writing result columns back + saving statistics to disk
+    """
+
+    # -----------------------------
+    # Constructor
+    # -----------------------------
+    def __init__(
+        self,
+        eval_result_path: Optional[str] = None,
+        eval_type: Literal[
+            "key1_text_score",
+            "key2_qa",
+            "key2_q_ma",
+            "key3_q_choices_a",
+            "key3_q_choices_as",
+            "key3_q_a_rejected",
+        ] = "key2_qa",
+        llm_serving: Optional[LLMServingABC] = None,
+        prompt_template: Union[AnswerJudgePrompt, DIYPromptABC] = AnswerJudgePrompt,
+        system_prompt: str = "You are a helpful assistant specialized in evaluating answer correctness.",
+        metric_type: Optional[str] = None,
+        use_semantic_judge: bool = False,
+    ):
+        if eval_result_path is None:
+            timestamp = int(time.time())
+            eval_result_path = f"result_bencheval/UnifiedBenchDatasetEvaluator_result_{timestamp}.json"
+
+        self.eval_result_path = eval_result_path
+        self.eval_type = eval_type
+        self.llm_serving = llm_serving
+        self.prompt_template = prompt_template
+        self.system_prompt = system_prompt
+        self.metric_type = metric_type
+        self.use_semantic_judge = use_semantic_judge
+
+        unit_manager = UnitTextManager()
+        string_cleaner = StringCleaner(unit_manager)
+        self.answer_extractor = AnswerExtractor(string_cleaner)
+
+        self.logger = get_logger()
+        self.empty_responses_count = 0
+
+    # -----------------------------
+    # Helper: required-column check
+    # -----------------------------
+    def _check_columns(self, dataframe: pd.DataFrame, cols: List[str]) -> bool:
+        ok = True
+        for c in cols:
+            if c not in dataframe.columns:
+                self.logger.error(f"Required column '{c}' not found in dataframe")
+                ok = False
+        return ok
+
+    # -----------------------------
+    # Helper: unified context concatenation
+    # -----------------------------
+    def _normalize_context(self, ctx: Any) -> Optional[str]:
+        if ctx is None:
+            return None
+        if isinstance(ctx, float) and np.isnan(ctx):
+            return None
+        if isinstance(ctx, list):
+            parts = []
+            for x in ctx:
+                if x is None:
+                    continue
+                s = str(x).strip()
+                if s:
+                    parts.append(s)
+            return "\n".join(parts) if parts else None
+        s = str(ctx).strip()
+        return s if s else None
+
+    # -----------------------------
+    # Helper: default prompt (used when prompt_template is missing or build_prompt is unavailable)
+    # -----------------------------
+    def _default_prompt(
+        self,
+        *,
+        question: Optional[str] = None,
+        context: Optional[str] = None,
+        text: Optional[str] = None,
+        choices: Optional[List[str]] = None,
+        task: str = "",
+    ) -> str:
+        if task == "text_score":
+            return (text or "").strip()
+
+        ctx_block = f"Context:\n{context}\n\n" if context else ""
+        q_block = f"Question:\n{(question or '').strip()}\n\n"
+
+        if choices is not None:
+            # Normalize choices into the A./B./C. format so templates can substitute it and the fallback parser can read it
+            letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            formatted = []
+            for i, ch in enumerate(choices):
+                tag = letters[i] if i < len(letters) else str(i)
+                formatted.append(f"{tag}. 
{str(ch)}") + choices_block = "Choices:\n" + "\n".join(formatted) + "\n\n" + return f"{ctx_block}{q_block}{choices_block}Answer:" + else: + return f"{ctx_block}{q_block}Answer:" + + def _build_prompt( + self, + *, + question: Optional[str] = None, + context: Optional[str] = None, + text: Optional[str] = None, + choices: Optional[List[str]] = None, + task: str = "", + ) -> str: + # 兼容你的 prompt_template(通常有 build_prompt) + if self.prompt_template is not None and hasattr(self.prompt_template, "build_prompt"): + try: + # 给模板更丰富的变量,模板不用可以忽略 + choices_text = None + if choices is not None: + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + formatted = [] + for i, ch in enumerate(choices): + tag = letters[i] if i < len(letters) else str(i) + formatted.append(f"{tag}. {str(ch)}") + choices_text = "\n".join(formatted) + + return self.prompt_template.build_prompt( + question=question, + context=context, + text=text, + choices=choices, + choices_text=choices_text, + task=task, + ) + except Exception as e: + self.logger.error(f"prompt_template.build_prompt failed, fallback to default. err={e}") + + return self._default_prompt(question=question, context=context, text=text, choices=choices, task=task) + + # ----------------------------- + # math_verify compare + # ----------------------------- + def _math_verify_compare(self, answer: Any, ground_truth: Any) -> bool: + try: + return verify(parse(str(ground_truth)), parse(str(answer))) + except Exception: + try: + return verify(parse(ground_truth), parse(answer)) + except Exception: + return False + + # ----------------------------- + # 多参考答案:把 targets 解析成 List[str] + # ----------------------------- + def _normalize_targets(self, targets: Any) -> List[str]: + if targets is None: + return [] + if isinstance(targets, float) and np.isnan(targets): + return [] + if isinstance(targets, list): + return [str(x) for x in targets if str(x).strip()] + + s = str(targets).strip() + if not s: + return [] + + # 尝试 json list + if (s.startswith("[") and s.endswith("]")) or (s.startswith("{") and s.endswith("}")): + try: + obj = json.loads(s) + if isinstance(obj, list): + return [str(x) for x in obj if str(x).strip()] + except Exception: + pass + + # 常见分隔 + if "||" in s: + parts = [p.strip() for p in s.split("||")] + elif "|" in s: + parts = [p.strip() for p in s.split("|")] + elif ";" in s: + parts = [p.strip() for p in s.split(";")] + else: + parts = [s] + return [p for p in parts if p] + + # ----------------------------- + # choice 解析(fallback 用) + # ----------------------------- + def _parse_choice_from_text(self, text: str, num_choices: int) -> Optional[int]: + if text is None: + return None + t = str(text).strip() + if not t: + return None + + # 先找 A/B/C... 
+ m = re.search(r"\b([A-Za-z])\b", t) + if m: + idx = ord(m.group(1).upper()) - ord("A") + if 0 <= idx < num_choices: + return idx + + # 再找数字(1-based 或 0-based 都兼容) + m = re.search(r"\b(\d+)\b", t) + if m: + val = int(m.group(1)) + if 0 <= val < num_choices: + return val + if 1 <= val <= num_choices: + return val - 1 + + return None + + def _parse_multiselect_set(self, text: str, num_choices: int) -> Optional[set]: + if text is None: + return None + s = str(text).strip() + if not s: + return None + + # json list + if s.startswith("[") and s.endswith("]"): + try: + obj = json.loads(s) + if isinstance(obj, list): + res = set() + for x in obj: + if isinstance(x, str): + x = x.strip() + if len(x) == 1 and x.isalpha(): + idx = ord(x.upper()) - ord("A") + if 0 <= idx < num_choices: + res.add(idx) + elif x.isdigit(): + v = int(x) + if 0 <= v < num_choices: + res.add(v) + elif 1 <= v <= num_choices: + res.add(v - 1) + elif isinstance(x, int): + if 0 <= x < num_choices: + res.add(x) + elif 1 <= x <= num_choices: + res.add(x - 1) + return res + except Exception: + pass + + # 字母集合:如 "A,C,D" / "B D" + letters = re.findall(r"\b([A-Za-z])\b", s) + if letters: + res = set() + for ch in letters: + idx = ord(ch.upper()) - ord("A") + if 0 <= idx < num_choices: + res.add(idx) + return res if res else None + + # 数字集合:如 "1,3,4" + nums = re.findall(r"\b(\d+)\b", s) + if nums: + res = set() + for n in nums: + v = int(n) + if 0 <= v < num_choices: + res.add(v) + elif 1 <= v <= num_choices: + res.add(v - 1) + return res if res else None + + return None + + # ----------------------------- + # micro-F1 / Jaccard + # ----------------------------- + def _set_metrics(self, pred: set, gold: set) -> Dict[str, float]: + if pred is None or gold is None: + return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "jaccard": 0.0, "exact_set": 0.0} + inter = len(pred & gold) + p = inter / len(pred) if len(pred) > 0 else 0.0 + r = inter / len(gold) if len(gold) > 0 else 0.0 + f1 = (2 * p * r / (p + r)) if (p + r) > 0 else 0.0 + j = inter / len(pred | gold) if len(pred | gold) > 0 else 0.0 + exact = 1.0 if pred == gold else 0.0 + return {"precision": float(p), "recall": float(r), "f1": float(f1), "jaccard": float(j), "exact_set": float(exact)} + + # ----------------------------- + # LLM loglikelihood 适配(尽量兼容不同 serving 实现) + # ----------------------------- + def _ll_batch(self, prompts: List[str], continuations: List[str]) -> Optional[List[float]]: + if self.llm_serving is None: + return None + + # 尝试常见方法名 + cand_names = [ + "loglikelihood_batch", + "loglikelihood", + "get_loglikelihood_batch", + "get_loglikelihood", + "score_batch", + "score", + ] + for name in cand_names: + if hasattr(self.llm_serving, name): + fn = getattr(self.llm_serving, name) + try: + # 兼容多种签名: (prompts, continuations) / (pairs) + try: + return fn(prompts=prompts, continuations=continuations) # type: ignore + except TypeError: + try: + return fn(prompts, continuations) # type: ignore + except TypeError: + pairs = list(zip(prompts, continuations)) + return fn(pairs) # type: ignore + except Exception as e: + self.logger.error(f"llm_serving.{name} failed: {e}") + return None + + self.logger.error("llm_serving does not provide any loglikelihood/score interface.") + return None + + def _ppl_batch(self, texts: List[str]) -> Optional[List[float]]: + if self.llm_serving is None: + return None + + model_id = getattr(self.llm_serving, "real_model_path", None) or getattr(self.llm_serving, "hf_model_name_or_path", None) + hf_cache_dir = getattr(self.llm_serving, 
"hf_cache_dir", None) + trust_remote_code = getattr(self.llm_serving, "trust_remote_code", True) + + if model_id is None: + self.logger.error("llm_serving does not expose real_model_path/hf_model_name_or_path; cannot compute ppl.") + return None + + try: + tokenizer = getattr(self, "_ppl_hf_tokenizer", None) + model = getattr(self, "_ppl_hf_model", None) + loaded_id = getattr(self, "_ppl_hf_model_id", None) + if tokenizer is None or model is None or loaded_id != model_id: + tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=hf_cache_dir, trust_remote_code=trust_remote_code) + model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=hf_cache_dir, trust_remote_code=trust_remote_code) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + model.eval() + self._ppl_hf_tokenizer = tokenizer + self._ppl_hf_model = model + self._ppl_hf_model_id = model_id + except Exception as e: + self.logger.error(f"failed to load hf model/tokenizer for ppl: {e}") + return None + + try: + device = next(model.parameters()).device + batch_size = 4 + ppls: List[float] = [] + max_len = getattr(getattr(model, "config", None), "max_position_embeddings", None) + + for start in range(0, len(texts), batch_size): + batch_texts = ["" if t is None else str(t) for t in texts[start:start + batch_size]] + enc = tokenizer( + batch_texts, + return_tensors="pt", + padding=True, + truncation=True, + max_length=max_len, + ) + input_ids = enc["input_ids"].to(device) + attention_mask = enc.get("attention_mask", None) + if attention_mask is not None: + attention_mask = attention_mask.to(device) + + with torch.no_grad(): + logits = model(input_ids=input_ids, attention_mask=attention_mask).logits + + shift_logits = logits[:, :-1, :].contiguous() + shift_labels = input_ids[:, 1:].contiguous() + + if attention_mask is None: + shift_mask = torch.ones_like(shift_labels, dtype=torch.float32, device=device) + else: + shift_mask = attention_mask[:, 1:].to(dtype=torch.float32) + + vocab_size = shift_logits.size(-1) + token_nll = F.cross_entropy( + shift_logits.view(-1, vocab_size), + shift_labels.view(-1), + reduction="none", + ).view(shift_labels.size(0), -1) + + nll_sum = (token_nll * shift_mask).sum(dim=1) + denom = shift_mask.sum(dim=1).clamp_min(1.0) + ppl_batch = torch.exp(nll_sum / denom).detach().cpu().tolist() + ppls.extend([float(x) for x in ppl_batch]) + + return ppls + except Exception as e: + self.logger.error(f"hf ppl computation failed: {e}") + return None + + # ----------------------------- + # 统计落盘 + # ----------------------------- + def _save_stats(self, bench_name_or_prefix: str, stats: Dict[str, Any]) -> None: + os.makedirs(os.path.dirname(self.eval_result_path), exist_ok=True) + df = pd.DataFrame([stats]) + df.to_json(self.eval_result_path, orient="records", force_ascii=False, indent=2) + self.logger.success(f"Statistics saved to {self.eval_result_path}") + + # ----------------------------- + # 主入口 + # ----------------------------- + def run( + self, + storage: DataFlowStorage, + keys_map: Optional[Dict[str, str]] = None, + context_key: Optional[str] = None, + input_pred_key: str = "generated_ans", + ) -> List[str]: + """ + keys_map 示例: + - key1_text_score: {"text": "text"} + - key2_qa: {"question":"question", "target":"golden_answer"} + - key2_q_ma: {"question":"question", "targets":"gold_answers"} + - key3_q_choices_a: {"question":"question", "choices":"choices", "label":"label"} + - key3_q_choices_as: {"question":"question", "choices":"choices", "labels":"labels"} 
+ - key3_q_a_rejected: {"question":"question", "better":"chosen", "rejected":"rejected"} + """ + df = storage.read("dataframe") + eval_type = self.eval_type + + + + # 输出列统一 + if "eval_valid" not in df.columns: + df["eval_valid"] = True + df["eval_error"] = "" + df["eval_pred"] = None + df["eval_score"] = np.nan # 数值型评分(accuracy 类用 0/1) + + # 默认 metric + metric_type = self.metric_type + if metric_type is None: + metric_type = self._default_metric_for_type(eval_type, self.use_semantic_judge) + + if keys_map is None: + self.logger.error("keys_map is required.") + storage.write(df) + return ["eval_valid", "eval_error", "eval_pred", "eval_score"] + + # context 处理:统一读一列(可无) + ctx_series = None + if context_key is not None: + if context_key not in df.columns: + self.logger.error(f"context_key '{context_key}' not found; treat as None.") + else: + ctx_series = df[context_key] + + # 分发 + if eval_type == "key1_text_score": + required = [keys_map.get("text", "")] + if not self._check_columns(df, required): + storage.write(df) + return required + + text_col = keys_map["text"] + texts = [str(x) if x is not None else "" for x in df[text_col].tolist()] + ppl = self._ppl_batch(texts) + if ppl is None: + df["eval_valid"] = False + df["eval_error"] = "ppl_unavailable" + storage.write(df) + self._save_stats(storage.file_name_prefix, { + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + "total_samples": len(df), + "valid_samples": 0, + "note": "ppl unavailable in llm_serving", + }) + return [text_col, "eval_score", "eval_valid", "eval_error"] + + df["eval_score"] = ppl + df["eval_pred"] = None + df["eval_valid"] = True + storage.write(df) + + stats = { + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + "total_samples": int(len(df)), + "valid_samples": int(len(df)), + "ppl_mean": float(np.mean(ppl)) if len(ppl) else 0.0, + } + self._save_stats(storage.file_name_prefix, stats) + return [text_col, "eval_score", "eval_valid", "eval_error"] + + elif eval_type in ("key2_qa", "key2_q_ma"): + # QA:默认走 math_verify 抽取+对比(可选 semantic_judge) + # 单参考:target + # 多参考:targets + question_col = keys_map.get("question", "") + if eval_type == "key2_qa": + target_col = keys_map.get("target", "") + required = [question_col, target_col, input_pred_key] + if not self._check_columns(df, required): + storage.write(df) + return required + + self._eval_qa_single( + df=df, + question_col=question_col, + target_col=target_col, + pred_col=input_pred_key, + ctx_series=ctx_series, + metric_type=metric_type, + ) + storage.write(df) + + stats = self._stats_for_binary(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return [question_col, target_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + + else: + targets_col = keys_map.get("targets", "") + required = [question_col, targets_col, input_pred_key] + if not self._check_columns(df, required): + storage.write(df) + return required + + self._eval_qa_multi( + df=df, + question_col=question_col, + targets_col=targets_col, + pred_col=input_pred_key, + ctx_series=ctx_series, + metric_type=metric_type, + ) + storage.write(df) + + stats = self._stats_for_binary(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return [question_col, 
targets_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + + elif eval_type == "key3_q_choices_a": + question_col = keys_map.get("question", "") + choices_col = keys_map.get("choices", "") + label_col = keys_map.get("label", "") + required = [question_col, choices_col, label_col] + # 若没有 llm_serving,则 fallback 需要 pred_col + if self.llm_serving is None: + required.append(input_pred_key) + + if not self._check_columns(df, required): + storage.write(df) + return required + + self._eval_mc_single( + df=df, + question_col=question_col, + choices_col=choices_col, + label_col=label_col, + ctx_series=ctx_series, + metric_type=metric_type, + pred_col=input_pred_key, + ) + storage.write(df) + + stats = self._stats_for_binary(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return [question_col, choices_col, label_col, "eval_score", "eval_valid", "eval_error"] + + elif eval_type == "key3_q_choices_as": + question_col = keys_map.get("question", "") + choices_col = keys_map.get("choices", "") + labels_col = keys_map.get("labels", "") + required = [question_col, choices_col, labels_col, input_pred_key] # 先按“解析模型输出集合”实现 + if not self._check_columns(df, required): + storage.write(df) + return required + + self._eval_mc_multi( + df=df, + question_col=question_col, + choices_col=choices_col, + labels_col=labels_col, + pred_col=input_pred_key, + metric_type=metric_type, + ) + storage.write(df) + + stats = self._stats_for_multiselect(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return [question_col, choices_col, labels_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + + elif eval_type == "key3_q_a_rejected": + question_col = keys_map.get("question", "") + better_col = keys_map.get("better", "") + rejected_col = keys_map.get("rejected", "") + required = [question_col, better_col, rejected_col] + if not self._check_columns(df, required): + storage.write(df) + return required + + if self.llm_serving is None: + # 这个类型没有 pred_col 可 fallback,只能报错 + df["eval_valid"] = False + df["eval_error"] = "llm_serving_required_for_pairwise" + storage.write(df) + stats = { + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + "total_samples": int(len(df)), + "valid_samples": 0, + "note": "pairwise requires llm_serving loglikelihood", + } + self._save_stats(storage.file_name_prefix, stats) + return required + ["eval_score", "eval_valid", "eval_error"] + + self._eval_pairwise( + df=df, + question_col=question_col, + better_col=better_col, + rejected_col=rejected_col, + ctx_series=ctx_series, + metric_type=metric_type, + ) + storage.write(df) + + stats = self._stats_for_binary(df) + stats.update({ + "bench_name_or_prefix": storage.file_name_prefix, + "type": eval_type, + "metric": metric_type, + }) + self._save_stats(storage.file_name_prefix, stats) + return required + ["eval_score", "eval_valid", "eval_error"] + + else: + self.logger.error(f"Unknown bench_dataflow_eval_type: {eval_type}") + storage.write(df) + return ["eval_valid", "eval_error", "eval_pred", "eval_score"] + + # ----------------------------- + # 默认 metric + # ----------------------------- + def _default_metric_for_type(self, t: str, use_semantic_judge: bool) -> str: + if t == "key1_text_score": + return "ppl" + if t == "key2_qa": + 
return "semantic_judge" if use_semantic_judge else "math_verify" + if t == "key2_q_ma": + return "any_math_verify" + if t == "key3_q_choices_a": + return "ll_choice_acc" + if t == "key3_q_choices_as": + return "micro_f1" + if t == "key3_q_a_rejected": + return "pairwise_ll_winrate" + return "unknown" + + # ----------------------------- + # 统计:binary(0/1) + # ----------------------------- + def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: + total = len(df) + valid_mask = df["eval_valid"] == True + valid = int(valid_mask.sum()) + # eval_score: 0/1 + if valid > 0: + acc = float(df.loc[valid_mask, "eval_score"].mean()) + else: + acc = 0.0 + return { + "total_samples": int(total), + "valid_samples": int(valid), + "accuracy": float(acc), + } + + # ----------------------------- + # 统计:多选(f1/jaccard 等) + # ----------------------------- + def _stats_for_multiselect(self, df: pd.DataFrame) -> Dict[str, Any]: + total = len(df) + valid_mask = df["eval_valid"] == True + valid = int(valid_mask.sum()) + # eval_score 默认存 f1 + if valid > 0: + f1_mean = float(df.loc[valid_mask, "eval_score"].mean()) + else: + f1_mean = 0.0 + # 如果你想要更多维度(jaccard/exact_set),可以从 eval_pred 里扩展存 dict,这里先给最小 + return { + "total_samples": int(total), + "valid_samples": int(valid), + "micro_f1_mean": float(f1_mean), + } + + # ----------------------------- + # key2_qa:单参考 + # ----------------------------- + def _eval_qa_single( + self, + df: pd.DataFrame, + question_col: str, + target_col: str, + pred_col: str, + ctx_series: Optional[pd.Series], + metric_type: str, + ) -> None: + if metric_type == "semantic_judge": + # 语义 judge 需要 llm_serving.generate_from_input + if self.llm_serving is None or not hasattr(self.llm_serving, "generate_from_input"): + self.logger.error("semantic_judge requires llm_serving.generate_from_input") + df["eval_valid"] = False + df["eval_error"] = "semantic_judge_unavailable" + return + + # 默认用“预测 vs 标准”直接 judge(你旧逻辑那套需要特定 Prompt,这里只做通用;你可自行替换为你自己的 AnswerJudgePrompt) + inputs = [] + row_indices = [] + for idx, row in df.iterrows(): + gt = row[target_col] + pred = row[pred_col] + if gt is None or (isinstance(gt, str) and gt.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_reference" + continue + if pred is None or (isinstance(pred, str) and pred.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_prediction" + continue + + prompt = ( + "You are an evaluator. 
Decide if the prediction is correct given the reference.\n" + f"Reference:\n{gt}\n\nPrediction:\n{pred}\n\n" + 'Return JSON: {"judgement_result": true/false}' + ) + inputs.append(prompt) + row_indices.append(idx) + + if not inputs: + return + + try: + responses = self.llm_serving.generate_from_input(user_inputs=inputs, system_prompt=self.system_prompt) + except Exception as e: + self.logger.error(f"semantic_judge generate_from_input failed: {e}") + for idx in row_indices: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "semantic_judge_failed" + return + + for idx, resp in zip(row_indices, responses): + ok = self._resolve_judge_response(resp) + df.at[idx, "eval_score"] = 1.0 if ok else 0.0 + df.at[idx, "eval_pred"] = None + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + return + + # 默认:math_verify + for idx, row in df.iterrows(): + gt = row[target_col] + pred_raw = row[pred_col] + if gt is None or (isinstance(gt, str) and gt.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_reference" + continue + if pred_raw is None or (isinstance(pred_raw, str) and pred_raw.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_prediction" + continue + + final_answer = self.answer_extractor.extract_answer(pred_raw, None) + ok = self._math_verify_compare(final_answer, gt) + df.at[idx, "eval_score"] = 1.0 if ok else 0.0 + df.at[idx, "eval_pred"] = str(final_answer) + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + # ----------------------------- + # key2_q_ma:多参考 + # ----------------------------- + def _eval_qa_multi( + self, + df: pd.DataFrame, + question_col: str, + targets_col: str, + pred_col: str, + ctx_series: Optional[pd.Series], + metric_type: str, + ) -> None: + # 默认:any_math_verify + for idx, row in df.iterrows(): + targets_raw = row[targets_col] + pred_raw = row[pred_col] + targets = self._normalize_targets(targets_raw) + + if len(targets) == 0: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_references" + continue + if pred_raw is None or (isinstance(pred_raw, str) and pred_raw.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_prediction" + continue + + final_answer = self.answer_extractor.extract_answer(pred_raw, None) + ok_any = False + for gt in targets: + if self._math_verify_compare(final_answer, gt): + ok_any = True + break + + df.at[idx, "eval_score"] = 1.0 if ok_any else 0.0 + df.at[idx, "eval_pred"] = str(final_answer) + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + # ----------------------------- + # key3_q_choices_a:单选 + # ----------------------------- + def _eval_mc_single( + self, + df: pd.DataFrame, + question_col: str, + choices_col: str, + label_col: str, + ctx_series: Optional[pd.Series], + metric_type: str, + pred_col: str, + ) -> None: + # 优先:loglikelihood + if metric_type == "ll_choice_acc" and self.llm_serving is not None: + # 批量做:每行要对 choices 逐个算 ll,先实现清晰版(你后面可优化 batching) + for idx, row in df.iterrows(): + q = row[question_col] + choices = row[choices_col] + label = row[label_col] + + if choices is None or (isinstance(choices, float) and np.isnan(choices)): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_choices" + continue + if not isinstance(choices, list): + # 尝试 json + try: + choices = json.loads(str(choices)) + except Exception: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "choices_not_list" + continue + if len(choices) == 
0: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_choices" + continue + + ctx = None + if ctx_series is not None: + ctx = self._normalize_context(ctx_series.loc[idx]) + + prompt = self._build_prompt(question=str(q), context=ctx, choices=[str(c) for c in choices], task="mc_single") + + # label 规范化为 idx + gold_idx = self._normalize_label_to_index(label, len(choices)) + if gold_idx is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "invalid_label" + continue + + prompts = [prompt] * len(choices) + conts = [] + for c in choices: + c_str = str(c) + # 常见做法:continuation 前补空格,避免直接拼在 Answer: 后面太粘连 + conts.append((" " + c_str) if (len(prompt) > 0 and not prompt.endswith((" ", "\n"))) else c_str) + + lls = self._ll_batch(prompts, conts) + if lls is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "ll_unavailable" + continue + + pred_idx = int(np.argmax(np.array(lls))) + df.at[idx, "eval_pred"] = int(pred_idx) + df.at[idx, "eval_score"] = 1.0 if pred_idx == gold_idx else 0.0 + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + return + + # fallback:从 pred_col 解析(generation 输出里抓 A/B/C 或数字) + self.logger.warning("ll_choice_acc unavailable; fallback to parse generated output for single-choice.") + for idx, row in df.iterrows(): + choices = row[choices_col] + label = row[label_col] + pred_text = row[pred_col] if pred_col in df.columns else None + + if choices is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_choices" + continue + if not isinstance(choices, list): + try: + choices = json.loads(str(choices)) + except Exception: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "choices_not_list" + continue + + gold_idx = self._normalize_label_to_index(label, len(choices)) + pred_idx = self._parse_choice_from_text(str(pred_text), len(choices)) if pred_text is not None else None + if gold_idx is None or pred_idx is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "parse_failed" + continue + + df.at[idx, "eval_pred"] = int(pred_idx) + df.at[idx, "eval_score"] = 1.0 if pred_idx == gold_idx else 0.0 + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + def _normalize_label_to_index(self, label: Any, n: int) -> Optional[int]: + if label is None: + return None + # 若 label 本身是 int + if isinstance(label, (int, np.integer)): + v = int(label) + if 0 <= v < n: + return v + if 1 <= v <= n: + return v - 1 + return None + s = str(label).strip() + if not s: + return None + # A/B/C + if len(s) == 1 and s.isalpha(): + idx = ord(s.upper()) - ord("A") + return idx if 0 <= idx < n else None + # 数字 + if s.isdigit(): + v = int(s) + if 0 <= v < n: + return v + if 1 <= v <= n: + return v - 1 + return None + + # ----------------------------- + # key3_q_choices_as:多选 + # ----------------------------- + def _eval_mc_multi( + self, + df: pd.DataFrame, + question_col: str, + choices_col: str, + labels_col: str, + pred_col: str, + metric_type: str, + ) -> None: + # 这里按你说的“先最小落地”:从 pred_col 解析集合 -> micro_f1 + for idx, row in df.iterrows(): + choices = row[choices_col] + gold = row[labels_col] + pred_text = row[pred_col] + + if choices is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_choices" + continue + if not isinstance(choices, list): + try: + choices = json.loads(str(choices)) + except Exception: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "choices_not_list" + continue + + n = len(choices) + gold_set = 
self._normalize_multilabel_to_set(gold, n) + pred_set = self._parse_multiselect_set(str(pred_text), n) + + if gold_set is None or pred_set is None: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "parse_failed" + continue + + m = self._set_metrics(pred_set, gold_set) + # eval_score 默认存 f1(你的上层聚合最常用) + df.at[idx, "eval_score"] = float(m["f1"]) + # eval_pred 存更丰富的信息,便于 debug + df.at[idx, "eval_pred"] = json.dumps( + {"pred_set": sorted(list(pred_set)), "gold_set": sorted(list(gold_set)), **m}, + ensure_ascii=False, + ) + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + def _normalize_multilabel_to_set(self, labels: Any, n: int) -> Optional[set]: + if labels is None: + return None + if isinstance(labels, float) and np.isnan(labels): + return None + if isinstance(labels, list): + s = set() + for x in labels: + idx = self._normalize_label_to_index(x, n) + if idx is None: + continue + s.add(idx) + return s if len(s) > 0 else set() + + s = str(labels).strip() + if not s: + return None + # json list + if s.startswith("[") and s.endswith("]"): + try: + obj = json.loads(s) + if isinstance(obj, list): + res = set() + for x in obj: + idx = self._normalize_label_to_index(x, n) + if idx is not None: + res.add(idx) + return res + except Exception: + pass + + # 分隔符 + parts = re.split(r"[,\s;/|]+", s) + res = set() + for p in parts: + p = p.strip() + if not p: + continue + idx = self._normalize_label_to_index(p, n) + if idx is not None: + res.add(idx) + return res if len(res) > 0 else set() + + # ----------------------------- + # key3_q_a_rejected:偏好对比 + # ----------------------------- + def _eval_pairwise( + self, + df: pd.DataFrame, + question_col: str, + better_col: str, + rejected_col: str, + ctx_series: Optional[pd.Series], + metric_type: str, + ) -> None: + # 默认:pairwise_ll_winrate + for idx, row in df.iterrows(): + q = row[question_col] + better = row[better_col] + rej = row[rejected_col] + + if better is None or (isinstance(better, str) and better.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_better" + continue + if rej is None or (isinstance(rej, str) and rej.strip() == ""): + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "empty_rejected" + continue + + ctx = None + if ctx_series is not None: + ctx = self._normalize_context(ctx_series.loc[idx]) + + prompt = self._build_prompt(question=str(q), context=ctx, task="pairwise") + + prompts = [prompt, prompt] + conts = [] + better_s = str(better) + rej_s = str(rej) + conts.append((" " + better_s) if (len(prompt) > 0 and not prompt.endswith((" ", "\n"))) else better_s) + conts.append((" " + rej_s) if (len(prompt) > 0 and not prompt.endswith((" ", "\n"))) else rej_s) + + lls = self._ll_batch(prompts, conts) + if lls is None or len(lls) != 2: + df.at[idx, "eval_valid"] = False + df.at[idx, "eval_error"] = "ll_unavailable" + continue + + win = 1.0 if float(lls[0]) > float(lls[1]) else 0.0 + df.at[idx, "eval_score"] = win + df.at[idx, "eval_pred"] = json.dumps({"ll_better": float(lls[0]), "ll_rejected": float(lls[1])}, ensure_ascii=False) + df.at[idx, "eval_valid"] = True + df.at[idx, "eval_error"] = "" + + # ----------------------------- + # 语义 judge 响应解析(兼容你旧逻辑) + # ----------------------------- + def _resolve_judge_response(self, response: Any) -> bool: + if response is None or (isinstance(response, str) and response.strip() == ""): + self.empty_responses_count += 1 + return False + try: + s = str(response) + # 尝试 json + try: + obj = json.loads(s) + if 
isinstance(obj, dict) and "judgement_result" in obj:
+                    return bool(obj["judgement_result"])
+            except Exception:
+                pass
+
+            pattern = re.compile(r'"judgement_result"\s*:\s*(true|false)', re.IGNORECASE)
+            m = pattern.search(s)
+            if m:
+                return m.group(1).lower() == "true"
+            # fallback
+            return ("true" in s.lower()) and ("false" not in s.lower())
+        except Exception as e:
+            self.logger.error(f"Response format error: {response}. Error: {e}")
+            return False
+
+    # -----------------------------
+    # Description
+    # -----------------------------
+    @staticmethod
+    def get_desc(lang: str = "zh"):
+        if lang == "zh":
+            return (
+                "统一 Bench 评测算子:支持 6 类纯文本评测范式。\n\n"
+                "支持类型:\n"
+                "- key1_text_score(默认 ppl)\n"
+                "- key2_qa(默认 math_verify / 可选 semantic_judge)\n"
+                "- key2_q_ma(默认 any_math_verify)\n"
+                "- key3_q_choices_a(默认 ll_choice_acc,若无 ll 接口则 fallback 解析生成)\n"
+                "- key3_q_choices_as(默认 micro_f1:解析多选集合后算 F1)\n"
+                "- key3_q_a_rejected(默认 pairwise_ll_winrate)\n\n"
+                "统一输出列:eval_score / eval_pred / eval_valid / eval_error,并支持统计落盘。"
+            )
+        return (
+            "Unified bench evaluator supporting 6 text-only task archetypes.\n"
+            "Outputs: eval_score / eval_pred / eval_valid / eval_error with stats saved."
+        )
diff --git a/dataflow/operators/core_text/generate/bench_answer_generator.py b/dataflow/operators/core_text/generate/bench_answer_generator.py
new file mode 100644
index 00000000..48976f72
--- /dev/null
+++ b/dataflow/operators/core_text/generate/bench_answer_generator.py
@@ -0,0 +1,250 @@
+from __future__ import annotations
+
+import json
+import re
+from typing import Any, Dict, List, Literal, Optional, Union
+
+import numpy as np
+import pandas as pd
+
+from dataflow import get_logger
+from dataflow.core import OperatorABC, LLMServingABC
+from dataflow.core.prompt import DIYPromptABC, prompt_restrict
+from dataflow.utils.registry import OPERATOR_REGISTRY
+from dataflow.utils.storage import DataFlowStorage
+
+
+@prompt_restrict()  # kept generic on purpose; no fixed prompt class is enforced
+
+@OPERATOR_REGISTRY.register()
+class BenchAnswerGenerator(OperatorABC):
+    """
+    Unified answer-generation operator for bench evaluation, parameter-aligned with UnifiedBenchDatasetEvaluator.
+
+    Inputs:
+      - eval_type: evaluation type, same values as the evaluator
+      - keys_map: column name of each field, same as the evaluator
+      - context_key: optional context column name; treated as None if not given
+    Outputs:
+      - output_key: column holding the generated answers, defaults to generated_ans
+      - types that need no generation do not write output_key and simply return an empty list
+    """
+
+    def __init__(
+        self,
+        llm_serving: LLMServingABC,
+        prompt_template: Optional[Union[DIYPromptABC, Any]] = None,
+        system_prompt: str = "You are a helpful assistant specialized in generating answers to questions.",
+        allow_overwrite: bool = False,
+        # whether to force generation for every type; by default only types that need predictions generate
+        force_generate: bool = False,
+    ):
+        self.logger = get_logger()
+        self.llm_serving = llm_serving
+        self.prompt_template = prompt_template
+        self.system_prompt = system_prompt
+        self.allow_overwrite = allow_overwrite
+        self.force_generate = force_generate
+
+    # ---------- helpers ----------
+    def _normalize_context(self, ctx: Any) -> Optional[str]:
+        if ctx is None:
+            return None
+        if isinstance(ctx, float) and np.isnan(ctx):
+            return None
+        if isinstance(ctx, list):
+            parts = []
+            for x in ctx:
+                if x is None:
+                    continue
+                s = str(x).strip()
+                if s:
+                    parts.append(s)
+            return "\n".join(parts) if parts else None
+        s = str(ctx).strip()
+        return s if s else None
+
+    def _ensure_list(self, v: Any) -> Optional[List[str]]:
+        if v is None:
+            return None
+        if isinstance(v, float) and np.isnan(v):
+            return None
+        if isinstance(v, list):
+            return [str(x) for x in v]
+        s = str(v).strip()
+        if not s:
+            return None
+        # try to parse a JSON list
+        if s.startswith("[") and 
s.endswith("]"): + try: + obj = json.loads(s) + if isinstance(obj, list): + return [str(x) for x in obj] + except Exception: + pass + return None + + def _format_choices_text(self, choices: List[str]) -> str: + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + lines = [] + for i, c in enumerate(choices): + tag = letters[i] if i < len(letters) else str(i) + lines.append(f"{tag}. {c}") + return "\n".join(lines) + + def _build_prompt_fallback( + self, + *, + eval_type: str, + question: Optional[str], + context: Optional[str], + choices: Optional[List[str]], + ) -> str: + ctx_block = f"Context:\n{context}\n\n" if context else "" + q_block = f"Question:\n{(question or '').strip()}\n\n" + + if eval_type in ("key2_qa", "key2_q_ma"): + return f"{ctx_block}{q_block}Answer:" + if eval_type == "key3_q_choices_a": + ch = self._format_choices_text(choices or []) + return f"{ctx_block}{q_block}Choices:\n{ch}\n\nChoose exactly one option. Output only the option letter (e.g., A).\nAnswer:" + if eval_type == "key3_q_choices_as": + ch = self._format_choices_text(choices or []) + return ( + f"{ctx_block}{q_block}Choices:\n{ch}\n\n" + "This is a multi-select question. Output JSON only, format: {\"choices\": [\"A\",\"C\"]}.\nAnswer:" + ) + # key1_text_score / key3_q_a_rejected 默认不需要生成 + return f"{ctx_block}{q_block}Answer:" + + def _build_prompt( + self, + *, + eval_type: str, + question: Optional[str], + context: Optional[str], + choices: Optional[List[str]], + ) -> str: + if self.prompt_template is not None and hasattr(self.prompt_template, "build_prompt"): + try: + return self.prompt_template.build_prompt( + eval_type=eval_type, + question=question, + context=context, + choices=choices, + choices_text=self._format_choices_text(choices) if choices else None, + ) + except Exception as e: + self.logger.error(f"prompt_template.build_prompt 失败, fallback 默认模板: {e}") + return self._build_prompt_fallback(eval_type=eval_type, question=question, context=context, choices=choices) + + def _call_generate(self, prompts: List[str]) -> List[str]: + if not hasattr(self.llm_serving, "generate_from_input"): + self.logger.error("llm_serving 缺少 generate_from_input 接口") + return [""] * len(prompts) + try: + # 兼容有无 system_prompt 参数 + try: + return self.llm_serving.generate_from_input(user_inputs=prompts, system_prompt=self.system_prompt) + except TypeError: + return self.llm_serving.generate_from_input(prompts) + except Exception as e: + self.logger.error(f"generate_from_input 执行失败: {e}") + return [""] * len(prompts) + + def _need_generation(self, eval_type: str) -> bool: + # evaluator 当前实现里: + # - key1_text_score: 不需要 generated_ans + # - key2_qa / key2_q_ma: 需要 generated_ans + # - key3_q_choices_a: 若 evaluator 用 ll 则不需要; 但为了可测试/兜底, 这里默认生成 + # - key3_q_choices_as: evaluator 当前用解析 generated_ans -> 需要 + # - key3_q_a_rejected: evaluator 用 ll 比较 better vs rejected -> 不需要 + if self.force_generate: + return eval_type != "key1_text_score" + return eval_type in ("key2_qa", "key2_q_ma", "key3_q_choices_a", "key3_q_choices_as") + + # ---------- 主入口 ---------- + def run( + self, + storage: DataFlowStorage, + eval_type: Literal[ + "key1_text_score", + "key2_qa", + "key2_q_ma", + "key3_q_choices_a", + "key3_q_choices_as", + "key3_q_a_rejected", + ], + keys_map: Dict[str, str], + context_key: Optional[str] = None, + output_key: str = "generated_ans", + ) -> List[str]: + df = storage.read("dataframe") + + if not self._need_generation(eval_type): + self.logger.info(f"[BenchAnswerGenerator] eval_type={eval_type} 默认不需要生成, 跳过") + storage.write(df) + return 
[] + + if (output_key in df.columns) and (not self.allow_overwrite): + self.logger.error(f"输出列已存在且不允许覆盖: {output_key}") + storage.write(df) + return [] + + # 读取字段 + q_col = keys_map.get("question") + if not q_col or q_col not in df.columns: + self.logger.error(f"缺少 question 列, keys_map.question={q_col}") + storage.write(df) + return [] + + ch_col = keys_map.get("choices") + need_choices = eval_type in ("key3_q_choices_a", "key3_q_choices_as") + if need_choices and (not ch_col or ch_col not in df.columns): + self.logger.error(f"缺少 choices 列, keys_map.choices={ch_col}") + storage.write(df) + return [] + + ctx_series = None + if context_key: + if context_key in df.columns: + ctx_series = df[context_key] + else: + self.logger.error(f"context_key 不存在: {context_key}, 视为 None") + + prompts: List[str] = [] + for idx, row in df.iterrows(): + q = row[q_col] + ctx = self._normalize_context(ctx_series.loc[idx]) if ctx_series is not None else None + + choices = None + if need_choices: + choices = self._ensure_list(row[ch_col]) + if not choices: + # choices 为空, 仍然生成一个可追踪的输出, 避免整体崩 + choices = [""] + + prompts.append( + self._build_prompt( + eval_type=eval_type, + question=str(q) if q is not None else "", + context=ctx, + choices=choices, + ) + ) + + answers = self._call_generate(prompts) + df[output_key] = answers + out_file = storage.write(df) + self.logger.info(f"[BenchAnswerGenerator] 生成完成, 保存到 {out_file}") + return [output_key] + + @staticmethod + def get_desc(lang: str = "zh"): + if lang == "zh": + return ( + "用于 bench 评测的统一生成算子, 与 evaluator 的 eval_type + keys_map 对齐。\n" + "默认只对需要生成输出的类型生成 output_key=generated_ans, 并支持 context_key 作为可选上下文。\n" + "可通过 allow_overwrite 控制是否覆盖已存在的输出列。" + ) + return "Unified bench answer generator aligned with evaluator eval_type and keys_map." 
\ No newline at end of file diff --git a/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py b/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py new file mode 100644 index 00000000..67de5601 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py @@ -0,0 +1,66 @@ +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +DIY_PROMPT_ANSWER = """Please output the answer.""" + +class UnifiedBenchEvalPipeline(): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + + self.storage = FileStorage( + first_entry_file_name="../example_data/core_text_data/bench_eval_data.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/scy/Model/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + prompt_template=None, + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type="key1_text_score", + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map={"text": "text"}, + context_key=None, + output_key="generated_ans", + ) + """ + all types: + "key1_text_score", + "key2_qa", + "key2_q_ma", + "key3_q_choices_a", + "key3_q_choices_as", + "key3_q_a_rejected", + """ + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map={"text": "text"}, + context_key=None, + input_pred_key="generated_ans", + + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.forward() From da0404fa52837cb4d1c9344d3f1e0626768e941a Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 5 Jan 2026 23:27:43 +0800 Subject: [PATCH 2/6] add all types of evaluation of model --- .../unified_bench_eval_type1.jsonl | 3 +- .../unified_bench_eval_type2.jsonl | 15 ++ .../unified_bench_eval_type3.jsonl | 6 + .../unified_bench_eval_type4.jsonl | 10 ++ .../unified_bench_eval_type5.jsonl | 10 ++ .../unified_bench_eval_type6.jsonl | 10 ++ dataflow/operators/core_text/__init__.py | 1 + .../eval/unified_bench_dataset_evaluator.py | 158 ++++++++++++++++-- .../generate/bench_answer_generator.py | 54 +++--- .../unified_bench_eval_pipeline.py | 35 ++-- .../unified_bench_eval_type1.py | 71 ++++++++ .../unified_bench_eval_type2.py | 81 +++++++++ .../unified_bench_eval_type3.py | 85 ++++++++++ .../unified_bench_eval_type4.py | 84 ++++++++++ .../unified_bench_eval_type5.py | 84 ++++++++++ .../unified_bench_eval_type6.py | 84 ++++++++++ 16 files changed, 744 insertions(+), 47 deletions(-) create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type2.jsonl create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type3.jsonl create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type4.jsonl create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type5.jsonl 
create mode 100644 dataflow/example/core_text_data/unified_bench_eval_type6.jsonl rename dataflow/statics/pipelines/gpu_pipelines/{ => benchmark_eval}/unified_bench_eval_pipeline.py (76%) create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py diff --git a/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl index a76d9002..1d66ba16 100644 --- a/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl +++ b/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl @@ -2,5 +2,4 @@ {"text": "The capital of France is Paris."} {"text": "Please evaluate the language model perplexity on this short example."} {"text": "Machine learning enables computers to learn patterns from data."} -{"text": "Perplexity is a common metric for evaluating language models on text scoring tasks."} -{"text": "666233gigity"} \ No newline at end of file +{"text": "Perplexity is a common metric for evaluating language models on text scoring tasks."} \ No newline at end of file diff --git a/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl new file mode 100644 index 00000000..54f28264 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl @@ -0,0 +1,15 @@ +{"id": "math_001", "question": "Solve for x: 2x + 3 = 11.", "model_answer": "Let's isolate x. Subtract 3 from both sides: 2x = 8. Then divide both sides by 2 to get x = 4. Therefore, the final answer is x = 4.", "golden_label": "x = 4"} +{"id": "math_002", "question": "If a circle has a radius of 5, what is its area?", "model_answer": "We know the formula for the area of a circle is 2πr. Plugging in r = 5 gives area = 10π. Hence, the area is 10π square units.", "golden_label": "25π"} +{"id": "math_003", "question": "Find the derivative of f(x) = x².", "model_answer": "Using the power rule, the derivative of x² is 2x¹ = 2x. Therefore, f'(x) = 2x.", "golden_label": "2x"} +{"id": "math_004", "question": "Solve 3x - 7 = 11.", "model_answer": "Start by adding 7 to both sides: 3x = 18. Divide both sides by 3 to isolate x, which gives x = 6. So the correct solution is x = 6.", "golden_label": "x = 6"} +{"id": "math_005", "question": "A triangle has sides of lengths 3, 4, and 5. What is its area?", "model_answer": "Since 3² + 4² = 5², it’s a right triangle. The area of a right triangle is ½ × base × height = ½ × 3 × 4 = 6. Hence, the area is 6 square units.", "golden_label": "6"} +{"id": "math_006", "question": "Find the vertex of the parabola y = x² + 2x + 1.", "model_answer": "The vertex formula is (-b/2a, c - b²/4a). Here, a = 1, b = 2, c = 1. Compute: -b/2a = -2/2 = -1, and y = 1 - 1 = 0. Wait, I think the vertex is (1, 0).", "golden_label": "(-1, 0)"} +{"id": "math_007", "question": "Simplify (x² - 1)/(x - 1).", "model_answer": "Factor the numerator: (x + 1)(x - 1)/(x - 1). The (x - 1) terms cancel, leaving x + 1. 
Therefore, the simplified result is x + 1.", "golden_label": "x + 1"} +{"id": "math_008", "question": "Evaluate log₂(8).", "model_answer": "We know that 2² = 8, so log₂(8) = 2. Therefore, the result is 2.", "golden_label": "3"} +{"id": "math_009", "question": "Find the indefinite integral ∫x dx.", "model_answer": "The integral of x with respect to x is x²/2 + C. So the answer is x²/2 + C.", "golden_label": "x²/2 + C"} +{"id": "math_010", "question": "Compute the slope of the line passing through (1, 2) and (4, 8).", "model_answer": "Slope m = (y₂ - y₁)/(x₂ - x₁) = (8 - 2)/(4 - 1) = 6/3 = 2. Hence, the slope is 2.", "golden_label": "2"} +{"id": "math_011", "question": "Expand (x + 3)².", "model_answer": "Using the binomial theorem: (x + 3)² = x² + 2×x×3 + 3² = x² + 6x + 9. Therefore, the expanded form is x² + 6x + 9.", "golden_label": "x² + 6x + 9"} +{"id": "math_012", "question": "What is sin(90°)?", "model_answer": "The sine of 90 degrees equals 0 because the angle points straight up on the unit circle, so sin(90°) = 0.", "golden_label": "1"} +{"id": "math_013", "question": "Find the determinant of [[2, 3], [1, 4]].", "model_answer": "The determinant is ad - bc = 2×4 - 3×1 = 8 - 3 = 5. Therefore, det = 5.", "golden_label": "5"} +{"id": "math_014", "question": "Compute 7 × 8.", "model_answer": "7 × 8 equals 54. I multiplied 7 × 7 = 49 and added 5, giving 54.", "golden_label": "56"} +{"id": "math_015", "question": "Find the derivative of f(x) = sin(x).", "model_answer": "The derivative of sin(x) with respect to x is cos(x). Thus, f'(x) = cos(x).", "golden_label": "cos(x)"} diff --git a/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl new file mode 100644 index 00000000..042755a4 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl @@ -0,0 +1,6 @@ +{"eval_type":"key2_q_ma","id":"ma_0001","context":"France's capital city is Paris.","question":"What is the capital of France?","targets":["Paris","The capital of France is Paris."]} +{"eval_type":"key2_q_ma","id":"ma_0002","context":"The chemical symbol for water is H2O.","question":"What is the chemical formula for water?","targets":["H2O","h2o"]} +{"eval_type":"key2_q_ma","id":"ma_0003","context":"Python is a popular programming language created by Guido van Rossum.","question":"Who created Python?","targets":["Guido van Rossum","Guido"]} +{"eval_type":"key2_q_ma","id":"ma_0004","context":"The largest planet in our solar system is Jupiter.","question":"Which is the largest planet in the solar system?","targets":["Jupiter","The largest planet is Jupiter."]} +{"eval_type":"key2_q_ma","id":"ma_0005","context":"Light travels at approximately 300,000 kilometers per second in vacuum.","question":"What is the approximate speed of light in vacuum?","targets":["300000 km/s","300,000 km/s","3e5 km/s","approximately 300,000 kilometers per second"]} +{"eval_type":"key2_q_ma","id":"ma_0006","context":"Shakespeare wrote the tragedy Hamlet.","question":"Who wrote Hamlet?","targets":["William Shakespeare","Shakespeare"]} diff --git a/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl new file mode 100644 index 00000000..db5d04b5 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl @@ -0,0 +1,10 @@ +{"eval_type":"key3_q_choices_a","id":"mc_0001","context":null,"question":"What is the capital of 
France?","choices":["Paris","London","Berlin","Rome"],"label":0} +{"eval_type":"key3_q_choices_a","id":"mc_0002","context":null,"question":"In Python, what does len([1, 2, 3]) return?","choices":["2","3","4","An error"],"label":1} +{"eval_type":"key3_q_choices_a","id":"mc_0003","context":"Assume standard SI units.","question":"Which physical quantity is measured in Newtons (N)?","choices":["Energy","Force","Power","Voltage"],"label":1} +{"eval_type":"key3_q_choices_a","id":"mc_0004","context":null,"question":"Which planet is the largest in the Solar System?","choices":["Earth","Mars","Jupiter","Venus"],"label":2} +{"eval_type":"key3_q_choices_a","id":"mc_0005","context":"Consider basic probability with a fair six-sided die.","question":"What is the probability of rolling a 6?","choices":["1/2","1/3","1/6","1/12"],"label":2} +{"eval_type":"key3_q_choices_a","id":"mc_0006","context":"You are reading an English sentence.","question":"Choose the word that best completes the sentence: \"She ___ to the store yesterday.\"","choices":["go","goes","went","going"],"label":2} +{"eval_type":"key3_q_choices_a","id":"mc_0007","context":"Pick the most appropriate next step in the sequence.","question":"What is the next number in the sequence 2, 4, 8, 16, ?","choices":["18","24","32","34"],"label":2} +{"eval_type":"key3_q_choices_a","id":"mc_0008","context":"Computer science basics.","question":"Which data structure uses FIFO (first in, first out)?","choices":["Stack","Queue","Tree","Heap"],"label":1} +{"eval_type":"key3_q_choices_a","id":"mc_0009","context":"Geography.","question":"Which ocean is the largest by surface area?","choices":["Indian Ocean","Atlantic Ocean","Arctic Ocean","Pacific Ocean"],"label":3} +{"eval_type":"key3_q_choices_a","id":"mc_0010","context":"Mathematics.","question":"If x = 3, what is the value of 2x + 5?","choices":["8","10","11","12"],"label":2} diff --git a/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl new file mode 100644 index 00000000..c4c8e567 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl @@ -0,0 +1,10 @@ +{"eval_type":"key3_q_choices_as","id":"ms_0001","context":null,"question":"Which of the following are prime numbers?","choices":["2","9","11","15"],"labels":[0,2]} +{"eval_type":"key3_q_choices_as","id":"ms_0002","context":"Select all correct statements about Python.","question":"Which of the following are valid ways to create a list in Python?","choices":["[]","list()","{}","()"],"labels":[0,1]} +{"eval_type":"key3_q_choices_as","id":"ms_0003","context":"Consider basic linear algebra.","question":"Which of the following are valid matrix operations?","choices":["Matrix addition (same shape)","Matrix multiplication (inner dims match)","Element-wise division is always defined","Taking the determinant of a non-square matrix"],"labels":[0,1]} +{"eval_type":"key3_q_choices_as","id":"ms_0004","context":"HTTP request methods are standardized verbs.","question":"Which of the following are HTTP methods?","choices":["GET","FETCH","POST","PUSH"],"labels":[0,2]} +{"eval_type":"key3_q_choices_as","id":"ms_0005","context":"JSON has a small set of primitive and composite types.","question":"Which of the following are valid JSON value types?","choices":["string","tuple","number","object"],"labels":[0,2,3]} +{"eval_type":"key3_q_choices_as","id":"ms_0006","context":"Pick all that apply.","question":"Which tasks are typically supervised learning?","choices":["Image 
classification","K-means clustering","Linear regression","PCA"],"labels":[0,2]} +{"eval_type":"key3_q_choices_as","id":"ms_0007","context":"Recall basic operating system concepts.","question":"Which of the following are common process scheduling algorithms?","choices":["Round-robin","Shortest Job First","Breadth-first search","First Come First Served"],"labels":[0,1,3]} +{"eval_type":"key3_q_choices_as","id":"ms_0008","context":"Select all true statements about TCP.","question":"Which of the following are features of TCP?","choices":["Connection-oriented","Guarantees in-order delivery","Message boundaries are preserved","Congestion control mechanisms exist"],"labels":[0,1,3]} +{"eval_type":"key3_q_choices_as","id":"ms_0009","context":"Choose all correct options.","question":"Which of the following numbers are divisible by 3?","choices":["21","22","24","25"],"labels":[0,2]} +{"eval_type":"key3_q_choices_as","id":"ms_0010","context":"Basic set theory.","question":"Which of the following sets are subsets of {1,2,3}?","choices":["{1,2}","{2,4}","{ }","{1,2,3,4}"],"labels":[0,2]} diff --git a/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl b/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl new file mode 100644 index 00000000..8b562084 --- /dev/null +++ b/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl @@ -0,0 +1,10 @@ +{"eval_type":"key3_q_a_rejected","id":"pw_0001","context":null,"question":"Explain what overfitting is in machine learning.","better":"Overfitting is when a model learns the training data too closely, including noise, so it performs well on training data but poorly on new, unseen data.","rejected":"Overfitting means the model is too good and always performs well everywhere."} +{"eval_type":"key3_q_a_rejected","id":"pw_0002","context":"Answer concisely in one sentence.","question":"What is the capital of France?","better":"Paris.","rejected":"France is a country in Europe with many cities."} +{"eval_type":"key3_q_a_rejected","id":"pw_0003","context":"Provide a clear step-by-step solution.","question":"Solve: 2x + 5 = 11.","better":"Subtract 5 from both sides to get 2x=6, then divide by 2 to get x=3.","rejected":"x is 11 because 2x plus 5 is 11."} +{"eval_type":"key3_q_a_rejected","id":"pw_0004","context":"You are writing documentation for a Python beginner.","question":"What does a Python list represent?","better":"A list is an ordered, mutable collection of items, written with square brackets like [1, 2, 3].","rejected":"A list is like a dictionary but faster and uses curly braces."} +{"eval_type":"key3_q_a_rejected","id":"pw_0005","context":"Give a polite refusal.","question":"Can you share my friend's private phone number?","better":"Sorry, I can’t help with sharing someone’s private contact information without their permission.","rejected":"Sure, tell me their name and I’ll provide the number."} +{"eval_type":"key3_q_a_rejected","id":"pw_0006","context":"Explain in simple terms.","question":"What is an API?","better":"An API is a set of rules that lets different software systems talk to each other, like a menu of requests you can make and the responses you’ll get.","rejected":"An API is a database that stores all your application’s data."} +{"eval_type":"key3_q_a_rejected","id":"pw_0007","context":"Answer with one short paragraph.","question":"Why do we use HTTPS instead of HTTP?","better":"HTTPS encrypts data between your browser and the server, which helps prevent eavesdropping and tampering, improving security and 
trust.","rejected":"HTTPS is used because it makes websites load faster than HTTP in all cases."} +{"eval_type":"key3_q_a_rejected","id":"pw_0008","context":"Focus on correctness.","question":"What is the derivative of x^2?","better":"The derivative of x^2 with respect to x is 2x.","rejected":"The derivative of x^2 is x."} +{"eval_type":"key3_q_a_rejected","id":"pw_0009","context":"Return a direct answer.","question":"How many bytes are in a kilobyte (KB) in the decimal system?","better":"1 KB is 1000 bytes in the decimal (SI) system.","rejected":"1 KB is always 1024 bytes no matter what."} +{"eval_type":"key3_q_a_rejected","id":"pw_0010","context":"Be clear and practical.","question":"How can you reduce Python virtual environment dependency conflicts?","better":"Pin dependencies with exact versions, use a lock file when possible, isolate projects per environment, and upgrade packages in a controlled way.","rejected":"Just install everything globally; conflicts will resolve themselves."} diff --git a/dataflow/operators/core_text/__init__.py b/dataflow/operators/core_text/__init__.py index 2ed5a57e..592469c9 100644 --- a/dataflow/operators/core_text/__init__.py +++ b/dataflow/operators/core_text/__init__.py @@ -11,6 +11,7 @@ from .generate.bench_answer_generator import BenchAnswerGenerator from .eval.bench_dataset_evaluator import BenchDatasetEvaluator from .eval.bench_dataset_evaluator_question import BenchDatasetEvaluatorQuestion + from .eval.unified_bench_dataset_evaluator import UnifiedBenchDatasetEvaluator from .eval.text2qa_sample_evaluator import Text2QASampleEvaluator from .eval.prompted_eval import PromptedEvaluator from .filter.prompted_filter import PromptedFilter diff --git a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py index 6a09990b..e0199dc6 100644 --- a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py +++ b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py @@ -4,6 +4,7 @@ import os import re import time +import unicodedata from dataclasses import dataclass from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union import numpy as np @@ -189,14 +190,47 @@ def _build_prompt( # ----------------------------- # math_verify compare # ----------------------------- - def _math_verify_compare(self, answer: Any, ground_truth: Any) -> bool: + def _try_math_verify_compare(self, answer: Any, ground_truth: Any) -> Optional[bool]: try: return verify(parse(str(ground_truth)), parse(str(answer))) except Exception: try: return verify(parse(ground_truth), parse(answer)) except Exception: - return False + return None + + def _math_verify_compare(self, answer: Any, ground_truth: Any) -> bool: + res = self._try_math_verify_compare(answer, ground_truth) + return bool(res) if res is not None else False + + def _normalize_text_for_match(self, text: Any) -> str: + if text is None: + return "" + s = unicodedata.normalize("NFKC", str(text)) + s = s.translate(str.maketrans({ + "₀": "0", + "₁": "1", + "₂": "2", + "₃": "3", + "₄": "4", + "₅": "5", + "₆": "6", + "₇": "7", + "₈": "8", + "₉": "9", + })) + s = s.strip() + s = re.sub(r"\s+", " ", s) + if s.endswith((".", "。", "!", "!", "?", "?")): + s = s[:-1].strip() + return s.casefold() + + def _text_contains_match(self, pred: Any, ref: Any) -> bool: + p = self._normalize_text_for_match(pred) + r = self._normalize_text_for_match(ref) + if not p or not r: + return False + return (r in p) or (p in r) # 
----------------------------- # 多参考答案:把 targets 解析成 List[str] @@ -367,8 +401,106 @@ def _ll_batch(self, prompts: List[str], continuations: List[str]) -> Optional[Li self.logger.error(f"llm_serving.{name} failed: {e}") return None - self.logger.error("llm_serving does not provide any loglikelihood/score interface.") - return None + model_id = getattr(self.llm_serving, "real_model_path", None) or getattr(self.llm_serving, "hf_model_name_or_path", None) + hf_cache_dir = getattr(self.llm_serving, "hf_cache_dir", None) + trust_remote_code = getattr(self.llm_serving, "trust_remote_code", True) + + if model_id is None: + self.logger.error("llm_serving does not expose real_model_path/hf_model_name_or_path; cannot compute loglikelihood.") + return None + + try: + tokenizer = getattr(self, "_ll_hf_tokenizer", None) + model = getattr(self, "_ll_hf_model", None) + loaded_id = getattr(self, "_ll_hf_model_id", None) + if tokenizer is None or model is None or loaded_id != model_id: + tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=hf_cache_dir, trust_remote_code=trust_remote_code) + model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=hf_cache_dir, trust_remote_code=trust_remote_code) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.to(device) + model.eval() + self._ll_hf_tokenizer = tokenizer + self._ll_hf_model = model + self._ll_hf_model_id = model_id + except Exception as e: + self.logger.error(f"failed to load hf model/tokenizer for loglikelihood: {e}") + return None + + try: + device = next(model.parameters()).device + pad_id = tokenizer.pad_token_id + if pad_id is None: + pad_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0 + + batch_size = 4 + lls: List[float] = [] + + def _safe_ids(text: str) -> List[int]: + return tokenizer(text, add_special_tokens=False).input_ids + + for start in range(0, len(prompts), batch_size): + ps = ["" if p is None else str(p) for p in prompts[start:start + batch_size]] + cs = ["" if c is None else str(c) for c in continuations[start:start + batch_size]] + + full_ids_list: List[List[int]] = [] + prompt_lens: List[int] = [] + cont_lens: List[int] = [] + + for p, c in zip(ps, cs): + full_ids = _safe_ids(p + c) + p_ids = _safe_ids(p) + if len(p_ids) <= len(full_ids) and full_ids[:len(p_ids)] == p_ids: + prompt_len = len(p_ids) + else: + c_ids = _safe_ids(c) + prompt_len = max(0, len(full_ids) - len(c_ids)) + cont_len = max(0, len(full_ids) - prompt_len) + full_ids_list.append(full_ids) + prompt_lens.append(prompt_len) + cont_lens.append(cont_len) + + max_len = max((len(x) for x in full_ids_list), default=0) + if max_len == 0: + lls.extend([0.0] * len(full_ids_list)) + continue + + input_ids = torch.full((len(full_ids_list), max_len), pad_id, dtype=torch.long, device=device) + attention_mask = torch.zeros((len(full_ids_list), max_len), dtype=torch.long, device=device) + for i, ids in enumerate(full_ids_list): + if not ids: + continue + t = torch.tensor(ids, dtype=torch.long, device=device) + input_ids[i, : t.numel()] = t + attention_mask[i, : t.numel()] = 1 + + with torch.no_grad(): + logits = model(input_ids=input_ids, attention_mask=attention_mask).logits + log_probs = F.log_softmax(logits, dim=-1) + + shift_log_probs = log_probs[:, :-1, :].contiguous() + shift_labels = input_ids[:, 1:].contiguous() + token_ll = shift_log_probs.gather(-1, shift_labels.unsqueeze(-1)).squeeze(-1) + + for i in range(len(full_ids_list)): + cont_len = cont_lens[i] + prompt_len = prompt_lens[i] + if cont_len <= 0: 
+ lls.append(0.0) + continue + start_pos = max(prompt_len, 1) + end_pos = prompt_len + cont_len + start_idx = start_pos - 1 + end_idx = end_pos - 1 + if end_idx <= start_idx: + lls.append(0.0) + continue + ll_val = float(token_ll[i, start_idx:end_idx].sum().detach().cpu()) + lls.append(ll_val) + + return lls + except Exception as e: + self.logger.error(f"hf loglikelihood computation failed: {e}") + return None def _ppl_batch(self, texts: List[str]) -> Optional[List[float]]: if self.llm_serving is None: @@ -478,8 +610,6 @@ def run( df = storage.read("dataframe") eval_type = self.eval_type - - # 输出列统一 if "eval_valid" not in df.columns: df["eval_valid"] = True @@ -787,7 +917,7 @@ def _eval_qa_single( df["eval_error"] = "semantic_judge_unavailable" return - # 默认用“预测 vs 标准”直接 judge(你旧逻辑那套需要特定 Prompt,这里只做通用;你可自行替换为你自己的 AnswerJudgePrompt) + # 默认用“预测 vs 标准”直接 judge(这里只做通用;可自行替换 AnswerJudgePrompt) inputs = [] row_indices = [] for idx, row in df.iterrows(): @@ -845,9 +975,11 @@ def _eval_qa_single( continue final_answer = self.answer_extractor.extract_answer(pred_raw, None) - ok = self._math_verify_compare(final_answer, gt) + text_ok = self._text_contains_match(pred_raw, gt) or self._text_contains_match(final_answer, gt) + math_res = self._try_math_verify_compare(final_answer, gt) + ok = text_ok or (math_res is True) df.at[idx, "eval_score"] = 1.0 if ok else 0.0 - df.at[idx, "eval_pred"] = str(final_answer) + df.at[idx, "eval_pred"] = str(final_answer) if (math_res is True) else str(pred_raw) df.at[idx, "eval_valid"] = True df.at[idx, "eval_error"] = "" @@ -880,13 +1012,17 @@ def _eval_qa_multi( final_answer = self.answer_extractor.extract_answer(pred_raw, None) ok_any = False + matched_by_text = False for gt in targets: - if self._math_verify_compare(final_answer, gt): + text_ok = self._text_contains_match(pred_raw, gt) or self._text_contains_match(final_answer, gt) + math_res = self._try_math_verify_compare(final_answer, gt) + if text_ok or (math_res is True): ok_any = True + matched_by_text = matched_by_text or text_ok break df.at[idx, "eval_score"] = 1.0 if ok_any else 0.0 - df.at[idx, "eval_pred"] = str(final_answer) + df.at[idx, "eval_pred"] = str(pred_raw) if matched_by_text else str(final_answer) df.at[idx, "eval_valid"] = True df.at[idx, "eval_error"] = "" diff --git a/dataflow/operators/core_text/generate/bench_answer_generator.py b/dataflow/operators/core_text/generate/bench_answer_generator.py index 48976f72..bcc619fe 100644 --- a/dataflow/operators/core_text/generate/bench_answer_generator.py +++ b/dataflow/operators/core_text/generate/bench_answer_generator.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import inspect import re from typing import Any, Dict, List, Literal, Optional, Union @@ -32,7 +33,15 @@ class BenchAnswerGenerator(OperatorABC): def __init__( self, - llm_serving: LLMServingABC, + eval_type: Literal[ + "key1_text_score", + "key2_qa", + "key2_q_ma", + "key3_q_choices_a", + "key3_q_choices_as", + "key3_q_a_rejected", + ] = "key2_qa", + llm_serving: Optional[LLMServingABC] = None, prompt_template: Optional[Union[DIYPromptABC, Any]] = None, system_prompt: str = "You are a helpful assistant specialized in generating answers to questions.", allow_overwrite: bool = False, @@ -45,6 +54,7 @@ def __init__( self.system_prompt = system_prompt self.allow_overwrite = allow_overwrite self.force_generate = force_generate + self.eval_type = eval_type # ---------- 工具函数 ---------- def _normalize_context(self, ctx: Any) -> Optional[str]: @@ -127,13 +137,25 @@ def 
_build_prompt( ) -> str: if self.prompt_template is not None and hasattr(self.prompt_template, "build_prompt"): try: - return self.prompt_template.build_prompt( - eval_type=eval_type, - question=question, - context=context, - choices=choices, - choices_text=self._format_choices_text(choices) if choices else None, - ) + fn = getattr(self.prompt_template, "build_prompt") + kwargs = { + "eval_type": eval_type, + "question": question, + "context": context, + "choices": choices, + "choices_text": self._format_choices_text(choices) if choices else None, + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + + sig = inspect.signature(fn) + params = sig.parameters.values() + has_varkw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params) + if has_varkw: + return fn(**kwargs) + + accepted = {p.name for p in params if p.name != "self"} + filtered = {k: v for k, v in kwargs.items() if k in accepted} + return fn(**filtered) except Exception as e: self.logger.error(f"prompt_template.build_prompt 失败, fallback 默认模板: {e}") return self._build_prompt_fallback(eval_type=eval_type, question=question, context=context, choices=choices) @@ -156,30 +178,24 @@ def _need_generation(self, eval_type: str) -> bool: # evaluator 当前实现里: # - key1_text_score: 不需要 generated_ans # - key2_qa / key2_q_ma: 需要 generated_ans - # - key3_q_choices_a: 若 evaluator 用 ll 则不需要; 但为了可测试/兜底, 这里默认生成 + # - key3_q_choices_a: evaluator 可用 ll 做选择题评估 -> 默认不生成 # - key3_q_choices_as: evaluator 当前用解析 generated_ans -> 需要 # - key3_q_a_rejected: evaluator 用 ll 比较 better vs rejected -> 不需要 if self.force_generate: return eval_type != "key1_text_score" - return eval_type in ("key2_qa", "key2_q_ma", "key3_q_choices_a", "key3_q_choices_as") + return eval_type in ("key2_qa", "key2_q_ma", "key3_q_choices_as") # ---------- 主入口 ---------- def run( self, storage: DataFlowStorage, - eval_type: Literal[ - "key1_text_score", - "key2_qa", - "key2_q_ma", - "key3_q_choices_a", - "key3_q_choices_as", - "key3_q_a_rejected", - ], keys_map: Dict[str, str], context_key: Optional[str] = None, output_key: str = "generated_ans", ) -> List[str]: + df = storage.read("dataframe") + eval_type = self.eval_type if not self._need_generation(eval_type): self.logger.info(f"[BenchAnswerGenerator] eval_type={eval_type} 默认不需要生成, 跳过") @@ -247,4 +263,4 @@ def get_desc(lang: str = "zh"): "默认只对需要生成输出的类型生成 output_key=generated_ans, 并支持 context_key 作为可选上下文。\n" "可通过 allow_overwrite 控制是否覆盖已存在的输出列。" ) - return "Unified bench answer generator aligned with evaluator eval_type and keys_map." \ No newline at end of file + return "Unified bench answer generator aligned with evaluator eval_type and keys_map." 
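Note on the log-likelihood scoring used above: the HF fallback added to _ll_batch scores each candidate continuation by summing the log-probabilities of its tokens conditioned on the prompt, and _eval_mc_single then picks the argmax over the choices (the same score drives the pairwise better-vs-rejected comparison). The following is a minimal, standalone sketch of that idea only, not part of the patch; the model id ("gpt2") and the helper name are illustrative assumptions, and the real operator additionally handles batching, padding, and prompt/continuation tokenization mismatches.

    # Sketch: continuation log-likelihood scoring for multiple choice.
    # Assumptions: any small HF causal LM ("gpt2" here) and greedy prefix
    # alignment between prompt tokens and prompt+continuation tokens.
    import torch
    import torch.nn.functional as F
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MODEL_ID = "gpt2"  # stand-in model for the sketch
    tok = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID).eval()

    def continuation_loglikelihood(prompt: str, continuation: str) -> float:
        """Sum of log P(token | prefix) over the continuation tokens only."""
        prompt_ids = tok(prompt, add_special_tokens=False).input_ids
        full_ids = tok(prompt + continuation, add_special_tokens=False).input_ids
        input_ids = torch.tensor([full_ids])
        with torch.no_grad():
            logits = model(input_ids).logits          # (1, seq_len, vocab)
        log_probs = F.log_softmax(logits, dim=-1)
        # The token at position i is predicted by the logits at position i-1,
        # so accumulate only over positions belonging to the continuation.
        ll = 0.0
        for pos in range(len(prompt_ids), len(full_ids)):
            ll += log_probs[0, pos - 1, full_ids[pos]].item()
        return ll

    prompt = ("Question:\nWhat is the capital of France?\n\n"
              "Choices:\nA. Paris\nB. London\n\nAnswer:")
    choices = [" A. Paris", " B. London"]
    scores = [continuation_loglikelihood(prompt, c) for c in choices]
    print("predicted index:", int(max(range(len(scores)), key=scores.__getitem__)))

For key3_q_a_rejected the same score is computed once for the better answer and once for the rejected answer, and the sample counts as a win when the better answer receives the higher log-likelihood.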
diff --git a/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py similarity index 76% rename from dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py rename to dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py index 67de5601..f18de41b 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/unified_bench_eval_pipeline.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py @@ -2,27 +2,40 @@ from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm from dataflow.core import LLMServingABC - + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + DIY_PROMPT_ANSWER = """Please output the answer.""" +EVAL_TYPE = "key1_text_score" +KEY_MAPS = {"text": "text"} class UnifiedBenchEvalPipeline(): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): self.storage = FileStorage( - first_entry_file_name="../example_data/core_text_data/bench_eval_data.jsonl", + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/scy/Model/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, prompt_template=None, allow_overwrite=False, force_generate=False, @@ -31,7 +44,7 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg self.evaluator_step2 = UnifiedBenchDatasetEvaluator( eval_result_path="./cache_local/eval_result/eval_result.jsonl", llm_serving=self.llm_serving_generator, - eval_type="key1_text_score", + eval_type=EVAL_TYPE, prompt_template=None, use_semantic_judge=False, metric_type=None, # use default metric @@ -40,22 +53,14 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map={"text": "text"}, + keys_map=KEY_MAPS, context_key=None, output_key="generated_ans", ) - """ - all types: - "key1_text_score", - "key2_qa", - "key2_q_ma", - "key3_q_choices_a", - "key3_q_choices_as", - "key3_q_a_rejected", - """ + self.evaluator_step2.run( storage=self.storage.step(), - keys_map={"text": "text"}, + keys_map=KEY_MAPS, context_key=None, input_pred_key="generated_ans", diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py new file mode 100644 index 00000000..f18de41b --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py @@ -0,0 +1,71 @@ +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", 
+"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +DIY_PROMPT_ANSWER = """Please output the answer.""" +EVAL_TYPE = "key1_text_score" +KEY_MAPS = {"text": "text"} + +class UnifiedBenchEvalPipeline(): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py new file mode 100644 index 00000000..14271498 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py @@ -0,0 +1,81 @@ +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key2_qa" +KEY_MAPS = { + "question": "question", + "target": "golden_label" +} + +class AnswerGeneratePromptDIY(DIYPromptABC): + def build_prompt(self, question:str = None): + prompt = f""" + Question: {question} + Answer: + """ + return prompt + +class UnifiedBenchEvalPipeline(): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=AnswerGeneratePromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + 
self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py new file mode 100644 index 00000000..e69fca06 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py @@ -0,0 +1,85 @@ +from dataflow.pipeline.Pipeline import PipelineABC +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key2_q_ma" +KEY_MAPS = { + "context": "context", # optional + "question": "question", + "targets": "targets" +} + +class AnswerGeneratePromptDIY(DIYPromptABC): + def build_prompt(self, question:str = None): + prompt = f""" + Question: {question} + Answer: + """ + return prompt + +class UnifiedBenchEvalPipeline(PipelineABC): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + super().__init__() + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=AnswerGeneratePromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.compile() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py new file mode 100644 index 00000000..d522f265 
--- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py @@ -0,0 +1,84 @@ +from dataflow.pipeline.Pipeline import PipelineABC +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key3_q_choices_a" +KEY_MAPS = { + "context": "context", # optional + "question": "question", + "choices": "choices", + "label": "label" +} + +class MMLUPromptDIY(DIYPromptABC): + def build_prompt(self, question: str = None, choices_text: str = None, context: str = None, **kwargs): + ctx = f"Context:\n{context}\n\n" if context else "" + return f"{ctx}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:" + + +class UnifiedBenchEvalPipeline(PipelineABC): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + super().__init__() + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=MMLUPromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.compile() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py new file mode 100644 index 00000000..90ca3f05 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py @@ -0,0 +1,84 @@ +from dataflow.pipeline.Pipeline import PipelineABC +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key3_q_choices_as" +KEY_MAPS = { + "context": "context", # optional + "question": "question", + "choices": "choices", + "labels": "labels" +} + +class 
MMLUPromptDIY(DIYPromptABC): + def build_prompt(self, question: str = None, choices_text: str = None, context: str = None, **kwargs): + ctx = f"Context:\n{context}\n\n" if context else "" + return f"{ctx}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:" + + +class UnifiedBenchEvalPipeline(PipelineABC): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + super().__init__() + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=MMLUPromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.compile() + pl.forward() diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py new file mode 100644 index 00000000..18aad7d1 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py @@ -0,0 +1,84 @@ +from dataflow.pipeline.Pipeline import PipelineABC +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key3_q_a_rejected" +KEY_MAPS = { + "context": "context", # optional + "question": "question", + "better": "better", + "rejected": "rejected" +} + +class PreferencePairwisePromptDIY(DIYPromptABC): + def build_prompt(self, question: str = None, context: str = None, **kwargs): + ctx = f"Context:\n{context}\n\n" if context else "" + return f"{ctx}Question:\n{question}\n\nAnswer:" + + +class UnifiedBenchEvalPipeline(PipelineABC): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + super().__init__() + + self.storage = FileStorage( + first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + 
hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=PreferencePairwisePromptDIY(), + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=False, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + keys_map=KEY_MAPS, + context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.compile() + pl.forward() From 2024daa61eb066c7bfacd3eb80d6946e844f302b Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 5 Jan 2026 23:29:41 +0800 Subject: [PATCH 3/6] fix bug in local llm serving when cleanup vllm but not started before --- dataflow/serving/local_model_llm_serving.py | 45 ++++++++++++++++----- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/dataflow/serving/local_model_llm_serving.py b/dataflow/serving/local_model_llm_serving.py index 74cc4e38..ad469d86 100644 --- a/dataflow/serving/local_model_llm_serving.py +++ b/dataflow/serving/local_model_llm_serving.py @@ -179,16 +179,34 @@ def generate_embedding_from_input(self, texts: list[str]) -> list[list[float]]: return [output.outputs.embedding for output in outputs] def cleanup(self): - free_mem = torch.cuda.mem_get_info()[0] # 返回可用显存(单位:字节) - total_mem = torch.cuda.get_device_properties(0).total_memory - self.logger.info(f"Free memory: {free_mem / (1024 ** 2):.2f} MB / {total_mem / (1024 ** 2):.2f} MB") - self.logger.info("Cleaning up vLLM backend resources...") self.backend_initialized = False + + if torch.cuda.is_available(): + free_mem = torch.cuda.mem_get_info()[0] + total_mem = torch.cuda.get_device_properties(0).total_memory + self.logger.info(f"Free memory: {free_mem / (1024 ** 2):.2f} MB / {total_mem / (1024 ** 2):.2f} MB") + + self.logger.info("Cleaning up vLLM backend resources...") + + if not hasattr(self, "llm") or self.llm is None: + import gc + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + try: + import ray + ray.shutdown() + except Exception: + pass + return + from vllm.distributed.parallel_state import ( destroy_model_parallel, destroy_distributed_environment, ) - del self.llm.llm_engine + + if hasattr(self.llm, "llm_engine"): + del self.llm.llm_engine del self.llm destroy_model_parallel() destroy_distributed_environment() @@ -196,13 +214,18 @@ def cleanup(self): torch.distributed.destroy_process_group() import gc gc.collect() - torch.cuda.empty_cache() - import ray - ray.shutdown() - free_mem = torch.cuda.mem_get_info()[0] # 返回可用显存(单位:字节) - total_mem = torch.cuda.get_device_properties(0).total_memory + if torch.cuda.is_available(): + torch.cuda.empty_cache() + try: + import ray + ray.shutdown() + except Exception: + pass - self.logger.info(f"Free memory: {free_mem / (1024 ** 2):.2f} MB / {total_mem / (1024 ** 2):.2f} MB") + if torch.cuda.is_available(): + free_mem = torch.cuda.mem_get_info()[0] + total_mem = 
torch.cuda.get_device_properties(0).total_memory + self.logger.info(f"Free memory: {free_mem / (1024 ** 2):.2f} MB / {total_mem / (1024 ** 2):.2f} MB") class LocalModelLLMServing_sglang(LLMServingABC): def __init__( From 3d8472b58b7fd68ef9cfddb0305d94626744aa7d Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Mon, 5 Jan 2026 23:34:07 +0800 Subject: [PATCH 4/6] remove useless pl --- .../unified_bench_eval_pipeline.py | 71 ------------------- 1 file changed, 71 deletions(-) delete mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py deleted file mode 100644 index f18de41b..00000000 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_pipeline.py +++ /dev/null @@ -1,71 +0,0 @@ -from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator -from dataflow.utils.storage import FileStorage -from dataflow.serving import LocalModelLLMServing_vllm -from dataflow.core import LLMServingABC - -""" -all types: -"key1_text_score", -"key2_qa", -"key2_q_ma", -"key3_q_choices_a", -"key3_q_choices_as", -"key3_q_a_rejected", -""" - -DIY_PROMPT_ANSWER = """Please output the answer.""" -EVAL_TYPE = "key1_text_score" -KEY_MAPS = {"text": "text"} - -class UnifiedBenchEvalPipeline(): - def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): - - self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl", - cache_path="./cache_local", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", - ) - - self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path - vllm_tensor_parallel_size=1, - vllm_max_tokens=2048, - ) - - self.answer_generator_step1 = BenchAnswerGenerator( - llm_serving=self.llm_serving_generator, - eval_type=EVAL_TYPE, - prompt_template=None, - allow_overwrite=False, - force_generate=False, - ) - - self.evaluator_step2 = UnifiedBenchDatasetEvaluator( - eval_result_path="./cache_local/eval_result/eval_result.jsonl", - llm_serving=self.llm_serving_generator, - eval_type=EVAL_TYPE, - prompt_template=None, - use_semantic_judge=False, - metric_type=None, # use default metric - ) - - def forward(self): - self.answer_generator_step1.run( - storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, - output_key="generated_ans", - ) - - self.evaluator_step2.run( - storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, - input_pred_key="generated_ans", - - ) - -if __name__ == "__main__": - pl = UnifiedBenchEvalPipeline() - pl.forward() From bdc5d87d2309a5bf8d62f1ad8dde24cf97bcc1d6 Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Tue, 6 Jan 2026 12:59:25 +0800 Subject: [PATCH 5/6] fix prompt_template bug in self test --- .../eval/unified_bench_dataset_evaluator.py | 314 ++++++++++-------- .../generate/bench_answer_generator.py | 62 +++- .../unified_bench_eval_type1.py | 8 +- .../unified_bench_eval_type2.py | 8 +- .../unified_bench_eval_type3.py | 8 +- .../unified_bench_eval_type4.py | 8 +- .../unified_bench_eval_type5.py | 8 +- .../unified_bench_eval_type6.py | 8 +- 8 files changed, 250 insertions(+), 174 deletions(-) diff --git 
a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py index e0199dc6..1958a4a0 100644 --- a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py +++ b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import json import os import re @@ -594,9 +592,13 @@ def _save_stats(self, bench_name_or_prefix: str, stats: Dict[str, Any]) -> None: def run( self, storage: DataFlowStorage, - keys_map: Optional[Dict[str, str]] = None, - context_key: Optional[str] = None, + input_keys_map: Optional[Dict[str, str]] = None, + input_context_key: Optional[str] = None, input_pred_key: str = "generated_ans", + output_eval_valid_key: str = "eval_valid", + output_eval_error_key: str = "eval_error", + output_eval_pred_key: str = "eval_pred", + output_eval_score_key: str = "eval_score", ) -> List[str]: """ keys_map 示例: @@ -611,43 +613,43 @@ def run( eval_type = self.eval_type # 输出列统一 - if "eval_valid" not in df.columns: - df["eval_valid"] = True - df["eval_error"] = "" - df["eval_pred"] = None - df["eval_score"] = np.nan # 数值型评分(accuracy 类用 0/1) + if output_eval_valid_key not in df.columns: + df[output_eval_valid_key] = True + df[output_eval_error_key] = "" + df[output_eval_pred_key] = None + df[output_eval_score_key] = np.nan # 数值型评分(accuracy 类用 0/1) # 默认 metric metric_type = self.metric_type if metric_type is None: metric_type = self._default_metric_for_type(eval_type, self.use_semantic_judge) - if keys_map is None: + if input_keys_map is None: self.logger.error("keys_map is required.") storage.write(df) - return ["eval_valid", "eval_error", "eval_pred", "eval_score"] + return [output_eval_valid_key, output_eval_error_key, output_eval_pred_key, output_eval_score_key] # context 处理:统一读一列(可无) ctx_series = None - if context_key is not None: - if context_key not in df.columns: - self.logger.error(f"context_key '{context_key}' not found; treat as None.") + if input_context_key is not None: + if input_context_key not in df.columns: + self.logger.error(f"context_key '{input_context_key}' not found; treat as None.") else: - ctx_series = df[context_key] + ctx_series = df[input_context_key] # 分发 if eval_type == "key1_text_score": - required = [keys_map.get("text", "")] + required = [input_keys_map.get("text", "")] if not self._check_columns(df, required): storage.write(df) return required - text_col = keys_map["text"] + text_col = input_keys_map["text"] texts = [str(x) if x is not None else "" for x in df[text_col].tolist()] ppl = self._ppl_batch(texts) if ppl is None: - df["eval_valid"] = False - df["eval_error"] = "ppl_unavailable" + df[output_eval_valid_key] = False + df[output_eval_error_key] = "ppl_unavailable" storage.write(df) self._save_stats(storage.file_name_prefix, { "bench_name_or_prefix": storage.file_name_prefix, @@ -657,11 +659,11 @@ def run( "valid_samples": 0, "note": "ppl unavailable in llm_serving", }) - return [text_col, "eval_score", "eval_valid", "eval_error"] + return [text_col, output_eval_score_key, output_eval_valid_key, output_eval_error_key] - df["eval_score"] = ppl - df["eval_pred"] = None - df["eval_valid"] = True + df[output_eval_score_key] = ppl + df[output_eval_pred_key] = None + df[output_eval_valid_key] = True storage.write(df) stats = { @@ -673,15 +675,15 @@ def run( "ppl_mean": float(np.mean(ppl)) if len(ppl) else 0.0, } self._save_stats(storage.file_name_prefix, stats) - return [text_col, "eval_score", "eval_valid", 
"eval_error"] + return [text_col, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type in ("key2_qa", "key2_q_ma"): # QA:默认走 math_verify 抽取+对比(可选 semantic_judge) # 单参考:target # 多参考:targets - question_col = keys_map.get("question", "") + question_col = input_keys_map.get("question", "") if eval_type == "key2_qa": - target_col = keys_map.get("target", "") + target_col = input_keys_map.get("target", "") required = [question_col, target_col, input_pred_key] if not self._check_columns(df, required): storage.write(df) @@ -704,10 +706,10 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return [question_col, target_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + return [question_col, target_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] else: - targets_col = keys_map.get("targets", "") + targets_col = input_keys_map.get("targets", "") required = [question_col, targets_col, input_pred_key] if not self._check_columns(df, required): storage.write(df) @@ -730,12 +732,12 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return [question_col, targets_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + return [question_col, targets_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_choices_a": - question_col = keys_map.get("question", "") - choices_col = keys_map.get("choices", "") - label_col = keys_map.get("label", "") + question_col = input_keys_map.get("question", "") + choices_col = input_keys_map.get("choices", "") + label_col = input_keys_map.get("label", "") required = [question_col, choices_col, label_col] # 若没有 llm_serving,则 fallback 需要 pred_col if self.llm_serving is None: @@ -763,12 +765,12 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return [question_col, choices_col, label_col, "eval_score", "eval_valid", "eval_error"] + return [question_col, choices_col, label_col, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_choices_as": - question_col = keys_map.get("question", "") - choices_col = keys_map.get("choices", "") - labels_col = keys_map.get("labels", "") + question_col = input_keys_map.get("question", "") + choices_col = input_keys_map.get("choices", "") + labels_col = input_keys_map.get("labels", "") required = [question_col, choices_col, labels_col, input_pred_key] # 先按“解析模型输出集合”实现 if not self._check_columns(df, required): storage.write(df) @@ -791,12 +793,12 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return [question_col, choices_col, labels_col, input_pred_key, "eval_score", "eval_valid", "eval_error"] + return [question_col, choices_col, labels_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_a_rejected": - question_col = keys_map.get("question", "") - better_col = keys_map.get("better", "") - rejected_col = keys_map.get("rejected", "") + question_col = input_keys_map.get("question", "") + better_col = input_keys_map.get("better", "") + rejected_col = input_keys_map.get("rejected", "") required = [question_col, better_col, rejected_col] if not self._check_columns(df, required): storage.write(df) @@ -804,8 +806,9 @@ def run( if self.llm_serving is None: # 这个类型没有 pred_col 可 fallback,只能报错 - df["eval_valid"] = False - df["eval_error"] = 
"llm_serving_required_for_pairwise" + self.logger.error("llm_serving is required for pairwise evaluation") + df[output_eval_valid_key] = False + df[output_eval_error_key] = "llm_serving_required_for_pairwise" storage.write(df) stats = { "bench_name_or_prefix": storage.file_name_prefix, @@ -816,7 +819,7 @@ def run( "note": "pairwise requires llm_serving loglikelihood", } self._save_stats(storage.file_name_prefix, stats) - return required + ["eval_score", "eval_valid", "eval_error"] + return required + [output_eval_score_key, output_eval_valid_key, output_eval_error_key] self._eval_pairwise( df=df, @@ -835,12 +838,12 @@ def run( "metric": metric_type, }) self._save_stats(storage.file_name_prefix, stats) - return required + ["eval_score", "eval_valid", "eval_error"] + return required + [output_eval_score_key, output_eval_valid_key, output_eval_error_key] else: self.logger.error(f"Unknown bench_dataflow_eval_type: {eval_type}") storage.write(df) - return ["eval_valid", "eval_error", "eval_pred", "eval_score"] + return [output_eval_valid_key, output_eval_error_key, input_pred_key, output_eval_score_key] # ----------------------------- # 默认 metric @@ -865,11 +868,11 @@ def _default_metric_for_type(self, t: str, use_semantic_judge: bool) -> str: # ----------------------------- def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: total = len(df) - valid_mask = df["eval_valid"] == True + valid_mask = df[output_eval_valid_key] == True valid = int(valid_mask.sum()) # eval_score: 0/1 if valid > 0: - acc = float(df.loc[valid_mask, "eval_score"].mean()) + acc = float(df.loc[valid_mask, output_eval_score_key].mean()) else: acc = 0.0 return { @@ -883,11 +886,11 @@ def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: # ----------------------------- def _stats_for_multiselect(self, df: pd.DataFrame) -> Dict[str, Any]: total = len(df) - valid_mask = df["eval_valid"] == True + valid_mask = df[output_eval_valid_key] == True valid = int(valid_mask.sum()) # eval_score 默认存 f1 if valid > 0: - f1_mean = float(df.loc[valid_mask, "eval_score"].mean()) + f1_mean = float(df.loc[valid_mask, output_eval_score_key].mean()) else: f1_mean = 0.0 # 如果你想要更多维度(jaccard/exact_set),可以从 eval_pred 里扩展存 dict,这里先给最小 @@ -913,8 +916,8 @@ def _eval_qa_single( # 语义 judge 需要 llm_serving.generate_from_input if self.llm_serving is None or not hasattr(self.llm_serving, "generate_from_input"): self.logger.error("semantic_judge requires llm_serving.generate_from_input") - df["eval_valid"] = False - df["eval_error"] = "semantic_judge_unavailable" + df[output_eval_valid_key] = False + df[output_eval_error_key] = "semantic_judge_unavailable" return # 默认用“预测 vs 标准”直接 judge(这里只做通用;可自行替换 AnswerJudgePrompt) @@ -924,12 +927,12 @@ def _eval_qa_single( gt = row[target_col] pred = row[pred_col] if gt is None or (isinstance(gt, str) and gt.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_reference" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_reference" continue if pred is None or (isinstance(pred, str) and pred.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_prediction" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_prediction" continue prompt = ( @@ -948,16 +951,16 @@ def _eval_qa_single( except Exception as e: self.logger.error(f"semantic_judge generate_from_input failed: {e}") for idx in row_indices: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = 
"semantic_judge_failed" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "semantic_judge_failed" return for idx, resp in zip(row_indices, responses): ok = self._resolve_judge_response(resp) - df.at[idx, "eval_score"] = 1.0 if ok else 0.0 - df.at[idx, "eval_pred"] = None - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_score_key] = 1.0 if ok else 0.0 + df.at[idx, output_eval_pred_key] = None + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" return @@ -966,22 +969,22 @@ def _eval_qa_single( gt = row[target_col] pred_raw = row[pred_col] if gt is None or (isinstance(gt, str) and gt.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_reference" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_reference" continue if pred_raw is None or (isinstance(pred_raw, str) and pred_raw.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_prediction" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_prediction" continue final_answer = self.answer_extractor.extract_answer(pred_raw, None) text_ok = self._text_contains_match(pred_raw, gt) or self._text_contains_match(final_answer, gt) math_res = self._try_math_verify_compare(final_answer, gt) ok = text_ok or (math_res is True) - df.at[idx, "eval_score"] = 1.0 if ok else 0.0 - df.at[idx, "eval_pred"] = str(final_answer) if (math_res is True) else str(pred_raw) - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_score_key] = 1.0 if ok else 0.0 + df.at[idx, output_eval_pred_key] = str(final_answer) if (math_res is True) else str(pred_raw) + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" # ----------------------------- # key2_q_ma:多参考 @@ -1002,12 +1005,12 @@ def _eval_qa_multi( targets = self._normalize_targets(targets_raw) if len(targets) == 0: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_references" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_references" continue if pred_raw is None or (isinstance(pred_raw, str) and pred_raw.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_prediction" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_prediction" continue final_answer = self.answer_extractor.extract_answer(pred_raw, None) @@ -1021,10 +1024,10 @@ def _eval_qa_multi( matched_by_text = matched_by_text or text_ok break - df.at[idx, "eval_score"] = 1.0 if ok_any else 0.0 - df.at[idx, "eval_pred"] = str(pred_raw) if matched_by_text else str(final_answer) - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_score_key] = 1.0 if ok_any else 0.0 + df.at[idx, output_eval_pred_key] = str(pred_raw) if matched_by_text else str(final_answer) + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" # ----------------------------- # key3_q_choices_a:单选 @@ -1048,20 +1051,20 @@ def _eval_mc_single( label = row[label_col] if choices is None or (isinstance(choices, float) and np.isnan(choices)): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_choices" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_choices" continue if not isinstance(choices, list): # 尝试 json try: choices = 
json.loads(str(choices)) except Exception: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "choices_not_list" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "choices_not_list" continue if len(choices) == 0: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_choices" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_choices" continue ctx = None @@ -1073,8 +1076,8 @@ def _eval_mc_single( # label 规范化为 idx gold_idx = self._normalize_label_to_index(label, len(choices)) if gold_idx is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "invalid_label" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "invalid_label" continue prompts = [prompt] * len(choices) @@ -1086,15 +1089,15 @@ def _eval_mc_single( lls = self._ll_batch(prompts, conts) if lls is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "ll_unavailable" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "ll_unavailable" continue pred_idx = int(np.argmax(np.array(lls))) - df.at[idx, "eval_pred"] = int(pred_idx) - df.at[idx, "eval_score"] = 1.0 if pred_idx == gold_idx else 0.0 - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_pred_key] = int(pred_idx) + df.at[idx, output_eval_score_key] = 1.0 if pred_idx == gold_idx else 0.0 + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" return @@ -1106,28 +1109,28 @@ def _eval_mc_single( pred_text = row[pred_col] if pred_col in df.columns else None if choices is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_choices" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_choices" continue if not isinstance(choices, list): try: choices = json.loads(str(choices)) except Exception: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "choices_not_list" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "choices_not_list" continue gold_idx = self._normalize_label_to_index(label, len(choices)) pred_idx = self._parse_choice_from_text(str(pred_text), len(choices)) if pred_text is not None else None if gold_idx is None or pred_idx is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "parse_failed" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "parse_failed" continue - df.at[idx, "eval_pred"] = int(pred_idx) - df.at[idx, "eval_score"] = 1.0 if pred_idx == gold_idx else 0.0 - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_pred_key] = int(pred_idx) + df.at[idx, output_eval_score_key] = 1.0 if pred_idx == gold_idx else 0.0 + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" def _normalize_label_to_index(self, label: Any, n: int) -> Optional[int]: if label is None: @@ -1175,15 +1178,15 @@ def _eval_mc_multi( pred_text = row[pred_col] if choices is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_choices" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_choices" continue if not isinstance(choices, list): try: choices = json.loads(str(choices)) except Exception: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "choices_not_list" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = 
"choices_not_list" continue n = len(choices) @@ -1191,20 +1194,20 @@ def _eval_mc_multi( pred_set = self._parse_multiselect_set(str(pred_text), n) if gold_set is None or pred_set is None: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "parse_failed" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "parse_failed" continue m = self._set_metrics(pred_set, gold_set) # eval_score 默认存 f1(你的上层聚合最常用) - df.at[idx, "eval_score"] = float(m["f1"]) + df.at[idx, output_eval_score_key] = float(m["f1"]) # eval_pred 存更丰富的信息,便于 debug - df.at[idx, "eval_pred"] = json.dumps( + df.at[idx, output_eval_pred_key] = json.dumps( {"pred_set": sorted(list(pred_set)), "gold_set": sorted(list(gold_set)), **m}, ensure_ascii=False, ) - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" def _normalize_multilabel_to_set(self, labels: Any, n: int) -> Optional[set]: if labels is None: @@ -1268,12 +1271,12 @@ def _eval_pairwise( rej = row[rejected_col] if better is None or (isinstance(better, str) and better.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_better" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_better" continue if rej is None or (isinstance(rej, str) and rej.strip() == ""): - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "empty_rejected" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "empty_rejected" continue ctx = None @@ -1291,15 +1294,15 @@ def _eval_pairwise( lls = self._ll_batch(prompts, conts) if lls is None or len(lls) != 2: - df.at[idx, "eval_valid"] = False - df.at[idx, "eval_error"] = "ll_unavailable" + df.at[idx, output_eval_valid_key] = False + df.at[idx, output_eval_error_key] = "ll_unavailable" continue win = 1.0 if float(lls[0]) > float(lls[1]) else 0.0 - df.at[idx, "eval_score"] = win - df.at[idx, "eval_pred"] = json.dumps({"ll_better": float(lls[0]), "ll_rejected": float(lls[1])}, ensure_ascii=False) - df.at[idx, "eval_valid"] = True - df.at[idx, "eval_error"] = "" + df.at[idx, output_eval_score_key] = win + df.at[idx, output_eval_pred_key] = json.dumps({"ll_better": float(lls[0]), "ll_rejected": float(lls[1])}, ensure_ascii=False) + df.at[idx, output_eval_valid_key] = True + df.at[idx, output_eval_error_key] = "" # ----------------------------- # 语义 judge 响应解析(兼容你旧逻辑) @@ -1335,17 +1338,62 @@ def _resolve_judge_response(self, response: Any) -> bool: def get_desc(lang: str = "zh"): if lang == "zh": return ( - "统一 Bench 评测算子:支持 6 类纯文本评测范式。\n\n" - "支持类型:\n" - "- key1_text_score(默认 ppl)\n" - "- key2_qa(默认 math_verify / 可选 semantic_judge)\n" - "- key2_q_ma(默认 any_math_verify)\n" - "- key3_q_choices_a(默认 ll_choice_acc,若无 ll 接口则 fallback 解析生成)\n" - "- key3_q_choices_as(默认 micro_f1:解析多选集合后算 F1)\n" - "- key3_q_a_rejected(默认 pairwise_ll_winrate)\n\n" - "统一输出列:eval_score / eval_pred / eval_valid / eval_error,并支持统计落盘。" + "该算子用于统一 Bench 评测,支持多种任务范式并将评测结果写回 DataFrame,同时输出整体统计到 eval_result_path。\n\n" + "支持类型与默认 metric:\n" + "- key1_text_score:ppl\n" + "- key2_qa:math_verify(或 use_semantic_judge=True 时 semantic_judge)\n" + "- key2_q_ma:any_math_verify(多参考)\n" + "- key3_q_choices_a:ll_choice_acc(基于 loglikelihood;无 serving 接口时使用 HF forward 计算 ll)\n" + "- key3_q_choices_as:micro_f1(解析多选集合后计算)\n" + "- key3_q_a_rejected:pairwise_ll_winrate(基于 ll 比较 better vs rejected)\n\n" + "初始化参数:\n" + "- eval_result_path:统计结果落盘路径\n" + "- 
eval_type:评测类型(同上)\n" + "- llm_serving:可选;用于 semantic_judge 或提供模型路径信息以进行 PPL/LL 的 HF 计算\n" + "- prompt_template:提示模板对象(可选;需提供 build_prompt;默认使用 AnswerJudgePrompt)\n" + "- system_prompt:语义评测/judge 的系统提示词\n" + "- metric_type:可选;不传则使用 eval_type 的默认 metric\n" + "- use_semantic_judge:仅对 key2_qa 有效;是否使用语义评测\n\n" + "运行参数:\n" + "- storage:DataFlowStorage\n" + "- input_keys_map:字段映射(不同 eval_type 需要不同 key:text/question/target/targets/choices/label/labels/better/rejected)\n" + "- input_context_key:可选,上下文字段名\n" + "- input_pred_key:预测答案字段名(默认 generated_ans)\n\n" + "输出:\n" + "- output_eval_score_key(数值分数)\n" + "- output_eval_pred_key(解析后的预测)\n" + "- output_eval_valid_key(是否有效)\n" + "- output_eval_error_key(错误信息)\n" + "- 保存统计:total_samples/valid_samples/accuracy 或 ppl_mean 等到 eval_result_path\n" + "- 返回本次评测涉及/产出的列名列表" ) return ( - "Unified bench evaluator supporting 6 text-only task archetypes.\n" - "Outputs: eval_score / eval_pred / eval_valid / eval_error with stats saved." - ) + "This operator evaluates unified bench datasets across multiple task archetypes. It writes per-sample results back to the dataframe and saves aggregated statistics to eval_result_path.\n\n" + "Supported Types (default metric):\n" + "- key1_text_score (ppl)\n" + "- key2_qa (math_verify or semantic_judge)\n" + "- key2_q_ma (any_math_verify)\n" + "- key3_q_choices_a (ll_choice_acc)\n" + "- key3_q_choices_as (micro_f1)\n" + "- key3_q_a_rejected (pairwise_ll_winrate)\n\n" + "Input Parameters:\n" + "- eval_result_path: Path to save aggregated statistics\n" + "- eval_type: Evaluation type (one of the supported types)\n" + "- llm_serving: Optional; required for semantic_judge and used as model source for HF-based PPL/LL computation\n" + "- prompt_template: Prompt template object (optional; must provide build_prompt; default is AnswerJudgePrompt)\n" + "- system_prompt: System prompt for semantic judging\n" + "- metric_type: Optional; overrides the default metric for the given eval_type\n" + "- use_semantic_judge: Only for key2_qa; whether to use LLM-based semantic judging\n\n" + "Run Parameters:\n" + "- storage: DataFlowStorage\n" + "- keys_map: Column mapping; depends on eval_type (text/question/target/targets/choices/label/labels/better/rejected)\n" + "- context_key: Optional context column name\n" + "- input_pred_key: Prediction column name (default: generated_ans)\n\n" + "Output Parameters:\n" + f"- output_eval_score_key: Numeric score (accuracy classes use 0/1)\n" + f"- output_eval_pred_key: Parsed prediction\n" + f"- output_eval_valid_key: Whether the sample is valid\n" + f"- output_eval_error_key: Error message if any\n" + "- Saves aggregated stats to eval_result_path\n" + "- Returns a list of involved/output keys" + ) \ No newline at end of file diff --git a/dataflow/operators/core_text/generate/bench_answer_generator.py b/dataflow/operators/core_text/generate/bench_answer_generator.py index bcc619fe..796c2033 100644 --- a/dataflow/operators/core_text/generate/bench_answer_generator.py +++ b/dataflow/operators/core_text/generate/bench_answer_generator.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import json import inspect import re @@ -41,11 +39,10 @@ def __init__( "key3_q_choices_as", "key3_q_a_rejected", ] = "key2_qa", - llm_serving: Optional[LLMServingABC] = None, - prompt_template: Optional[Union[DIYPromptABC, Any]] = None, + llm_serving: LLMServingABC = None, + prompt_template: DIYPromptABC = None, system_prompt: str = "You are a helpful assistant specialized in generating answers to questions.", 
allow_overwrite: bool = False, - # 是否强制对所有类型都生成, 默认只对需要 pred 的类型生成 force_generate: bool = False, ): self.logger = get_logger() @@ -189,8 +186,8 @@ def _need_generation(self, eval_type: str) -> bool: def run( self, storage: DataFlowStorage, - keys_map: Dict[str, str], - context_key: Optional[str] = None, + input_keys_map: Dict[str, str], + input_context_key: Optional[str] = None, output_key: str = "generated_ans", ) -> List[str]: @@ -208,13 +205,13 @@ def run( return [] # 读取字段 - q_col = keys_map.get("question") + q_col = input_keys_map.get("question") if not q_col or q_col not in df.columns: self.logger.error(f"缺少 question 列, keys_map.question={q_col}") storage.write(df) return [] - ch_col = keys_map.get("choices") + ch_col = input_keys_map.get("choices") need_choices = eval_type in ("key3_q_choices_a", "key3_q_choices_as") if need_choices and (not ch_col or ch_col not in df.columns): self.logger.error(f"缺少 choices 列, keys_map.choices={ch_col}") @@ -222,11 +219,11 @@ def run( return [] ctx_series = None - if context_key: - if context_key in df.columns: - ctx_series = df[context_key] + if input_context_key: + if input_context_key in df.columns: + ctx_series = df[input_context_key] else: - self.logger.error(f"context_key 不存在: {context_key}, 视为 None") + self.logger.error(f"context_key 不存在: {input_context_key}, 视为 None") prompts: List[str] = [] for idx, row in df.iterrows(): @@ -259,8 +256,39 @@ def run( def get_desc(lang: str = "zh"): if lang == "zh": return ( - "用于 bench 评测的统一生成算子, 与 evaluator 的 eval_type + keys_map 对齐。\n" - "默认只对需要生成输出的类型生成 output_key=generated_ans, 并支持 context_key 作为可选上下文。\n" - "可通过 allow_overwrite 控制是否覆盖已存在的输出列。" + "该算子用于 bench 评测的统一答案生成,根据 eval_type + keys_map 从 DataFrame 取字段构造 prompt 并批量调用 LLM 生成答案。\n" + "对于默认不需要生成的类型会跳过生成(可用 force_generate 强制)。\n\n" + "初始化参数:\n" + "- eval_type:评测类型(key1_text_score / key2_qa / key2_q_ma / key3_q_choices_a / key3_q_choices_as / key3_q_a_rejected)\n" + "- llm_serving:LLM 服务对象(需提供 generate_from_input)\n" + "- prompt_template:提示模板对象(可选,需提供 build_prompt;否则使用内置 fallback 模板)\n" + "- system_prompt:系统提示词\n" + "- allow_overwrite:输出列已存在时是否允许覆盖\n" + "- force_generate:是否强制对可生成类型都生成\n\n" + "运行参数:\n" + "- storage:DataFlowStorage\n" + "- input_keys_map:字段映射,至少包含 question;选择题需包含 choices\n" + "- input_context_key:可选,上下文字段名\n" + "- output_key:生成结果列名(默认 generated_ans)\n\n" + "输出:\n" + "- 写回 DataFrame 的 output_key 列(若跳过生成则不写)\n" + "- 返回新增/写入的列名列表(通常为 [output_key] 或 [])" ) - return "Unified bench answer generator aligned with evaluator eval_type and keys_map." 
+ return ( + "This operator generates answers for unified bench evaluation by building prompts from a dataframe and calling an LLM.\n\n" + "Input Parameters:\n" + "- eval_type: Evaluation type (key1_text_score/key2_qa/key2_q_ma/key3_q_choices_a/key3_q_choices_as/key3_q_a_rejected)\n" + "- llm_serving: LLM serving object (must provide generate_from_input)\n" + "- prompt_template: Prompt template object (optional; must provide build_prompt; falls back to an internal template)\n" + "- system_prompt: System prompt passed to the serving (if supported)\n" + "- allow_overwrite: Whether to overwrite an existing output column\n" + "- force_generate: Whether to force generation for types that can be skipped by default\n\n" + "Run Parameters:\n" + "- storage: DataFlowStorage\n" + "- keys_map: Column mapping (requires question; for choice tasks requires choices)\n" + "- context_key: Optional context column name\n" + "- output_key: Output column name for generated answers (default: generated_ans)\n\n" + "Output Parameters:\n" + "- Writes output_key into the dataframe when generation is performed\n" + "- Returns a list of written keys (usually [output_key] or [])" + ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py index f18de41b..13941c02 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py @@ -53,15 +53,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py index 14271498..55f580df 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py @@ -64,15 +64,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py index e69fca06..6992e080 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py @@ -67,15 +67,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - 
context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py index d522f265..c84ccb4a 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py @@ -66,15 +66,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py index 90ca3f05..461c65b9 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py @@ -66,15 +66,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py index 18aad7d1..a5ff886c 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py @@ -66,15 +66,15 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - keys_map=KEY_MAPS, - context_key=None, + input_keys_map=KEY_MAPS, + input_context_key=None, input_pred_key="generated_ans", ) From 83abcbbbbb39092795c6f5f11ca2d759b60ce498 Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Thu, 8 Jan 2026 22:36:45 +0800 Subject: [PATCH 6/6] using formatstrprompt op instead of coding string prompt in pipeline.py --- .../eval/unified_bench_dataset_evaluator.py | 116 +++++++++++++----- .../generate/bench_answer_generator.py | 74 ++++++++--- .../unified_bench_eval_type1.py | 11 +- .../unified_bench_eval_type2.py | 29 ++--- .../unified_bench_eval_type3.py | 34 +++-- .../unified_bench_eval_type4.py | 35 +++--- .../unified_bench_eval_type5.py | 35 +++--- .../unified_bench_eval_type6.py | 34 +++-- 
.../unified_bench_eval_type_semantic.py | 83 +++++++++++++ 9 files changed, 299 insertions(+), 152 deletions(-) create mode 100644 dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type_semantic.py diff --git a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py index 1958a4a0..c90d721c 100644 --- a/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py +++ b/dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py @@ -41,7 +41,7 @@ class UnifiedBenchDatasetEvaluator(OperatorABC): - key3_q_a_rejected 核心思想: - 只需要传 bench_dataflow_eval_type + metric_type + keys_map + (可选) context_key + 只需要传 bench_dataflow_eval_type + metric_type + input_xxx_key + (可选) context_key - evaluator 内部负责: 1) 读取 dataframe 2) 取 keys @@ -592,7 +592,15 @@ def _save_stats(self, bench_name_or_prefix: str, stats: Dict[str, Any]) -> None: def run( self, storage: DataFlowStorage, - input_keys_map: Optional[Dict[str, str]] = None, + input_text_key: Optional[str] = None, + input_question_key: Optional[str] = None, + input_target_key: Optional[str] = None, + input_targets_key: Optional[str] = None, + input_choices_key: Optional[str] = None, + input_label_key: Optional[str] = None, + input_labels_key: Optional[str] = None, + input_better_key: Optional[str] = None, + input_rejected_key: Optional[str] = None, input_context_key: Optional[str] = None, input_pred_key: str = "generated_ans", output_eval_valid_key: str = "eval_valid", @@ -601,17 +609,22 @@ def run( output_eval_score_key: str = "eval_score", ) -> List[str]: """ - keys_map 示例: - - key1_text_score: {"text": "text"} - - key2_qa: {"question":"question", "target":"golden_answer"} - - key2_q_ma: {"question":"question", "targets":"gold_answers"} - - key3_q_choices_a: {"question":"question", "choices":"choices", "label":"label"} - - key3_q_choices_as: {"question":"question", "choices":"choices", "labels":"labels"} - - key3_q_a_rejected: {"question":"question", "better":"chosen", "rejected":"rejected"} + 字段列名通过 input_xxx_key 显式传入(未传默认 None): + - key1_text_score: input_text_key + - key2_qa: input_question_key + input_target_key + - key2_q_ma: input_question_key + input_targets_key + - key3_q_choices_a: input_question_key + input_choices_key + input_label_key + - key3_q_choices_as: input_question_key + input_choices_key + input_labels_key + - key3_q_a_rejected: input_question_key + input_better_key + input_rejected_key """ df = storage.read("dataframe") eval_type = self.eval_type + self.output_eval_valid_key = output_eval_valid_key + self.output_eval_error_key = output_eval_error_key + self.output_eval_pred_key = output_eval_pred_key + self.output_eval_score_key = output_eval_score_key + # 输出列统一 if output_eval_valid_key not in df.columns: df[output_eval_valid_key] = True @@ -624,10 +637,6 @@ def run( if metric_type is None: metric_type = self._default_metric_for_type(eval_type, self.use_semantic_judge) - if input_keys_map is None: - self.logger.error("keys_map is required.") - storage.write(df) - return [output_eval_valid_key, output_eval_error_key, output_eval_pred_key, output_eval_score_key] # context 处理:统一读一列(可无) ctx_series = None @@ -639,12 +648,12 @@ def run( # 分发 if eval_type == "key1_text_score": - required = [input_keys_map.get("text", "")] + text_col = input_text_key or "" + required = [text_col] if not self._check_columns(df, required): storage.write(df) return required - text_col = input_keys_map["text"] texts = [str(x) if x is not 
None else "" for x in df[text_col].tolist()] ppl = self._ppl_batch(texts) if ppl is None: @@ -681,9 +690,9 @@ def run( # QA:默认走 math_verify 抽取+对比(可选 semantic_judge) # 单参考:target # 多参考:targets - question_col = input_keys_map.get("question", "") + question_col = input_question_key or "" if eval_type == "key2_qa": - target_col = input_keys_map.get("target", "") + target_col = input_target_key or "" required = [question_col, target_col, input_pred_key] if not self._check_columns(df, required): storage.write(df) @@ -709,7 +718,7 @@ def run( return [question_col, target_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] else: - targets_col = input_keys_map.get("targets", "") + targets_col = input_targets_key or "" required = [question_col, targets_col, input_pred_key] if not self._check_columns(df, required): storage.write(df) @@ -735,9 +744,9 @@ def run( return [question_col, targets_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_choices_a": - question_col = input_keys_map.get("question", "") - choices_col = input_keys_map.get("choices", "") - label_col = input_keys_map.get("label", "") + question_col = input_question_key or "" + choices_col = input_choices_key or "" + label_col = input_label_key or "" required = [question_col, choices_col, label_col] # 若没有 llm_serving,则 fallback 需要 pred_col if self.llm_serving is None: @@ -768,9 +777,9 @@ def run( return [question_col, choices_col, label_col, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_choices_as": - question_col = input_keys_map.get("question", "") - choices_col = input_keys_map.get("choices", "") - labels_col = input_keys_map.get("labels", "") + question_col = input_question_key or "" + choices_col = input_choices_key or "" + labels_col = input_labels_key or "" required = [question_col, choices_col, labels_col, input_pred_key] # 先按“解析模型输出集合”实现 if not self._check_columns(df, required): storage.write(df) @@ -796,9 +805,9 @@ def run( return [question_col, choices_col, labels_col, input_pred_key, output_eval_score_key, output_eval_valid_key, output_eval_error_key] elif eval_type == "key3_q_a_rejected": - question_col = input_keys_map.get("question", "") - better_col = input_keys_map.get("better", "") - rejected_col = input_keys_map.get("rejected", "") + question_col = input_question_key or "" + better_col = input_better_key or "" + rejected_col = input_rejected_key or "" required = [question_col, better_col, rejected_col] if not self._check_columns(df, required): storage.write(df) @@ -867,6 +876,9 @@ def _default_metric_for_type(self, t: str, use_semantic_judge: bool) -> str: # 统计:binary(0/1) # ----------------------------- def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: + output_eval_valid_key = self.output_eval_valid_key + output_eval_score_key = self.output_eval_score_key + total = len(df) valid_mask = df[output_eval_valid_key] == True valid = int(valid_mask.sum()) @@ -885,6 +897,9 @@ def _stats_for_binary(self, df: pd.DataFrame) -> Dict[str, Any]: # 统计:多选(f1/jaccard 等) # ----------------------------- def _stats_for_multiselect(self, df: pd.DataFrame) -> Dict[str, Any]: + output_eval_valid_key = self.output_eval_valid_key + output_eval_score_key = self.output_eval_score_key + total = len(df) valid_mask = df[output_eval_valid_key] == True valid = int(valid_mask.sum()) @@ -912,6 +927,11 @@ def _eval_qa_single( ctx_series: Optional[pd.Series], metric_type: str, ) -> None: 
+ output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + if metric_type == "semantic_judge": # 语义 judge 需要 llm_serving.generate_from_input if self.llm_serving is None or not hasattr(self.llm_serving, "generate_from_input"): @@ -998,6 +1018,11 @@ def _eval_qa_multi( ctx_series: Optional[pd.Series], metric_type: str, ) -> None: + output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + # 默认:any_math_verify for idx, row in df.iterrows(): targets_raw = row[targets_col] @@ -1042,6 +1067,11 @@ def _eval_mc_single( metric_type: str, pred_col: str, ) -> None: + output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + # 优先:loglikelihood if metric_type == "ll_choice_acc" and self.llm_serving is not None: # 批量做:每行要对 choices 逐个算 ll,先实现清晰版(你后面可优化 batching) @@ -1171,6 +1201,11 @@ def _eval_mc_multi( pred_col: str, metric_type: str, ) -> None: + output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + # 这里按你说的“先最小落地”:从 pred_col 解析集合 -> micro_f1 for idx, row in df.iterrows(): choices = row[choices_col] @@ -1264,6 +1299,11 @@ def _eval_pairwise( ctx_series: Optional[pd.Series], metric_type: str, ) -> None: + output_eval_valid_key = self.output_eval_valid_key + output_eval_error_key = self.output_eval_error_key + output_eval_pred_key = self.output_eval_pred_key + output_eval_score_key = self.output_eval_score_key + # 默认:pairwise_ll_winrate for idx, row in df.iterrows(): q = row[question_col] @@ -1356,7 +1396,15 @@ def get_desc(lang: str = "zh"): "- use_semantic_judge:仅对 key2_qa 有效;是否使用语义评测\n\n" "运行参数:\n" "- storage:DataFlowStorage\n" - "- input_keys_map:字段映射(不同 eval_type 需要不同 key:text/question/target/targets/choices/label/labels/better/rejected)\n" + "- input_text_key:文本列名(key1_text_score)\n" + "- input_question_key:问题列名(key2/key3)\n" + "- input_target_key:单个参考答案列名(key2_qa)\n" + "- input_targets_key:多个参考答案列名(key2_q_ma)\n" + "- input_choices_key:选项列名(key3_q_choices_a/key3_q_choices_as)\n" + "- input_label_key:单个标签列名(key3_q_choices_a)\n" + "- input_labels_key:多个标签列名(key3_q_choices_as)\n" + "- input_better_key:优选答案列名(key3_q_a_rejected)\n" + "- input_rejected_key:劣选答案列名(key3_q_a_rejected)\n" "- input_context_key:可选,上下文字段名\n" "- input_pred_key:预测答案字段名(默认 generated_ans)\n\n" "输出:\n" @@ -1386,8 +1434,16 @@ def get_desc(lang: str = "zh"): "- use_semantic_judge: Only for key2_qa; whether to use LLM-based semantic judging\n\n" "Run Parameters:\n" "- storage: DataFlowStorage\n" - "- keys_map: Column mapping; depends on eval_type (text/question/target/targets/choices/label/labels/better/rejected)\n" - "- context_key: Optional context column name\n" + "- input_text_key: Text column name (key1_text_score)\n" + "- input_question_key: Question column name (key2/key3)\n" + "- input_target_key: Single reference answer column name (key2_qa)\n" + "- input_targets_key: Multiple reference answers column name (key2_q_ma)\n" + "- input_choices_key: Choices column name (key3_q_choices_a/key3_q_choices_as)\n" + "- input_label_key: 
Single label column name (key3_q_choices_a)\n" + "- input_labels_key: Multiple labels column name (key3_q_choices_as)\n" + "- input_better_key: Better answer column name (key3_q_a_rejected)\n" + "- input_rejected_key: Rejected answer column name (key3_q_a_rejected)\n" + "- input_context_key: Optional context column name\n" "- input_pred_key: Prediction column name (default: generated_ans)\n\n" "Output Parameters:\n" f"- output_eval_score_key: Numeric score (accuracy classes use 0/1)\n" diff --git a/dataflow/operators/core_text/generate/bench_answer_generator.py b/dataflow/operators/core_text/generate/bench_answer_generator.py index 796c2033..501c554a 100644 --- a/dataflow/operators/core_text/generate/bench_answer_generator.py +++ b/dataflow/operators/core_text/generate/bench_answer_generator.py @@ -1,6 +1,5 @@ import json import inspect -import re from typing import Any, Dict, List, Literal, Optional, Union import numpy as np @@ -9,11 +8,14 @@ from dataflow import get_logger from dataflow.core import OperatorABC, LLMServingABC from dataflow.core.prompt import DIYPromptABC, prompt_restrict +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.utils.registry import OPERATOR_REGISTRY from dataflow.utils.storage import DataFlowStorage -@prompt_restrict() # 保持通用, 不强绑固定 prompt 类 +@prompt_restrict( + FormatStrPrompt + ) @OPERATOR_REGISTRY.register() class BenchAnswerGenerator(OperatorABC): @@ -22,8 +24,8 @@ class BenchAnswerGenerator(OperatorABC): 输入: - eval_type: 评测类型, 取值同 evaluator - - keys_map: 指定各字段名, 同 evaluator - - context_key: 可选, 上下文字段名, 不传则 None + - 运行时通过 input_xxx_key 传入各字段名(未传默认 None) + - input_context_key: 可选, 上下文字段名, 不传则 None 输出: - output_key: 生成结果列, 默认 generated_ans - 对于不需要生成的类型, 默认不写 output_key, 直接返回空列表 @@ -40,7 +42,7 @@ def __init__( "key3_q_a_rejected", ] = "key2_qa", llm_serving: LLMServingABC = None, - prompt_template: DIYPromptABC = None, + prompt_template: Union[FormatStrPrompt, DIYPromptABC] = FormatStrPrompt, system_prompt: str = "You are a helpful assistant specialized in generating answers to questions.", allow_overwrite: bool = False, force_generate: bool = False, @@ -135,23 +137,35 @@ def _build_prompt( if self.prompt_template is not None and hasattr(self.prompt_template, "build_prompt"): try: fn = getattr(self.prompt_template, "build_prompt") + + if eval_type in ("key3_q_choices_a", "key3_q_choices_as"): + need_fields = {"question", "choices"} + else: + need_fields = {"question"} + kwargs = { "eval_type": eval_type, "question": question, - "context": context, + "context": context or "", "choices": choices, - "choices_text": self._format_choices_text(choices) if choices else None, + "choices_text": self._format_choices_text(choices) if choices else "", } - kwargs = {k: v for k, v in kwargs.items() if v is not None} sig = inspect.signature(fn) - params = sig.parameters.values() + params = list(sig.parameters.values()) has_varkw = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params) + + accepted = {p.name for p in params if p.name != "self"} + expects_need_fields = "need_fields" in accepted + if has_varkw: + if expects_need_fields: + return fn(need_fields, **kwargs) return fn(**kwargs) - accepted = {p.name for p in params if p.name != "self"} filtered = {k: v for k, v in kwargs.items() if k in accepted} + if expects_need_fields: + return fn(need_fields, **filtered) return fn(**filtered) except Exception as e: self.logger.error(f"prompt_template.build_prompt 失败, fallback 默认模板: {e}") @@ -186,7 +200,15 @@ def _need_generation(self, eval_type: str) -> 
bool: def run( self, storage: DataFlowStorage, - input_keys_map: Dict[str, str], + input_text_key: Optional[str] = None, + input_question_key: Optional[str] = None, + input_target_key: Optional[str] = None, + input_targets_key: Optional[str] = None, + input_choices_key: Optional[str] = None, + input_label_key: Optional[str] = None, + input_labels_key: Optional[str] = None, + input_better_key: Optional[str] = None, + input_rejected_key: Optional[str] = None, input_context_key: Optional[str] = None, output_key: str = "generated_ans", ) -> List[str]: @@ -205,16 +227,16 @@ def run( return [] # 读取字段 - q_col = input_keys_map.get("question") + q_col = input_question_key if not q_col or q_col not in df.columns: - self.logger.error(f"缺少 question 列, keys_map.question={q_col}") + self.logger.error(f"缺少 question 列, input_question_key={q_col}") storage.write(df) return [] - ch_col = input_keys_map.get("choices") + ch_col = input_choices_key need_choices = eval_type in ("key3_q_choices_a", "key3_q_choices_as") if need_choices and (not ch_col or ch_col not in df.columns): - self.logger.error(f"缺少 choices 列, keys_map.choices={ch_col}") + self.logger.error(f"缺少 choices 列, input_choices_key={ch_col}") storage.write(df) return [] @@ -267,7 +289,15 @@ def get_desc(lang: str = "zh"): "- force_generate:是否强制对可生成类型都生成\n\n" "运行参数:\n" "- storage:DataFlowStorage\n" - "- input_keys_map:字段映射,至少包含 question;选择题需包含 choices\n" + "- input_text_key:文本列名(key1_text_score)\n" + "- input_question_key:问题列名(key2/key3)\n" + "- input_target_key:单个参考答案列名(key2_qa)\n" + "- input_targets_key:多个参考答案列名(key2_q_ma)\n" + "- input_choices_key:选项列名(key3_q_choices_a/key3_q_choices_as)\n" + "- input_label_key:单个标签列名(key3_q_choices_a)\n" + "- input_labels_key:多个标签列名(key3_q_choices_as)\n" + "- input_better_key:优选答案列名(key3_q_a_rejected)\n" + "- input_rejected_key:劣选答案列名(key3_q_a_rejected)\n" "- input_context_key:可选,上下文字段名\n" "- output_key:生成结果列名(默认 generated_ans)\n\n" "输出:\n" @@ -285,8 +315,16 @@ def get_desc(lang: str = "zh"): "- force_generate: Whether to force generation for types that can be skipped by default\n\n" "Run Parameters:\n" "- storage: DataFlowStorage\n" - "- keys_map: Column mapping (requires question; for choice tasks requires choices)\n" - "- context_key: Optional context column name\n" + "- input_text_key: Text column name (key1_text_score)\n" + "- input_question_key: Question column name (key2/key3)\n" + "- input_target_key: Single reference answer column name (key2_qa)\n" + "- input_targets_key: Multiple reference answers column name (key2_q_ma)\n" + "- input_choices_key: Choices column name (key3_q_choices_a/key3_q_choices_as)\n" + "- input_label_key: Single label column name (key3_q_choices_a)\n" + "- input_labels_key: Multiple labels column name (key3_q_choices_as)\n" + "- input_better_key: Better answer column name (key3_q_a_rejected)\n" + "- input_rejected_key: Rejected answer column name (key3_q_a_rejected)\n" + "- input_context_key: Optional context column name\n" "- output_key: Output column name for generated answers (default: generated_ans)\n\n" "Output Parameters:\n" "- Writes output_key into the dataframe when generation is performed\n" diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py index 13941c02..37aa98f5 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py +++ 
b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py @@ -13,22 +13,20 @@ "key3_q_a_rejected", """ -DIY_PROMPT_ANSWER = """Please output the answer.""" EVAL_TYPE = "key1_text_score" -KEY_MAPS = {"text": "text"} class UnifiedBenchEvalPipeline(): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type1.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type1.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) @@ -53,17 +51,16 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, + input_text_key="text", input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, + input_text_key="text", input_context_key=None, input_pred_key="generated_ans", - ) if __name__ == "__main__": diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py index 55f580df..605df4e0 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py @@ -1,4 +1,5 @@ from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.core.prompt import DIYPromptABC from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm @@ -15,39 +16,31 @@ """ EVAL_TYPE = "key2_qa" -KEY_MAPS = { - "question": "question", - "target": "golden_label" -} - -class AnswerGeneratePromptDIY(DIYPromptABC): - def build_prompt(self, question:str = None): - prompt = f""" - Question: {question} - Answer: - """ - return prompt class UnifiedBenchEvalPipeline(): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type2.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type2.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="Question: {question}\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=AnswerGeneratePromptDIY(), + prompt_template=self.generation_prompt_template, 
allow_overwrite=False, force_generate=False, ) @@ -64,14 +57,16 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, + input_question_key="question", + input_target_key="golden_label", input_context_key=None, output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, + input_question_key="question", + input_target_key="golden_label", input_context_key=None, input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py index 6992e080..9be6f441 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py @@ -1,5 +1,6 @@ from dataflow.pipeline.Pipeline import PipelineABC from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.core.prompt import DIYPromptABC from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm @@ -16,41 +17,32 @@ """ EVAL_TYPE = "key2_q_ma" -KEY_MAPS = { - "context": "context", # optional - "question": "question", - "targets": "targets" -} - -class AnswerGeneratePromptDIY(DIYPromptABC): - def build_prompt(self, question:str = None): - prompt = f""" - Question: {question} - Answer: - """ - return prompt class UnifiedBenchEvalPipeline(PipelineABC): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): super().__init__() self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type3.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type3.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="Question: {question}\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=AnswerGeneratePromptDIY(), + prompt_template=self.generation_prompt_template, allow_overwrite=False, force_generate=False, ) @@ -67,15 +59,17 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_targets_key="targets", output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_targets_key="targets", input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py 
b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py index c84ccb4a..3716cc57 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py @@ -1,6 +1,6 @@ from dataflow.pipeline.Pipeline import PipelineABC from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator -from dataflow.core.prompt import DIYPromptABC +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm from dataflow.core import LLMServingABC @@ -16,40 +16,32 @@ """ EVAL_TYPE = "key3_q_choices_a" -KEY_MAPS = { - "context": "context", # optional - "question": "question", - "choices": "choices", - "label": "label" -} - -class MMLUPromptDIY(DIYPromptABC): - def build_prompt(self, question: str = None, choices_text: str = None, context: str = None, **kwargs): - ctx = f"Context:\n{context}\n\n" if context else "" - return f"{ctx}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:" - class UnifiedBenchEvalPipeline(PipelineABC): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): super().__init__() self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type4.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type4.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="{context}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=MMLUPromptDIY(), + prompt_template=self.generation_prompt_template, allow_overwrite=False, force_generate=False, ) @@ -66,15 +58,18 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_choices_key="choices", output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_choices_key="choices", + input_label_key="label", input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py index 461c65b9..56705cae 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py @@ -1,6 +1,6 @@ from dataflow.pipeline.Pipeline import PipelineABC from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator -from dataflow.core.prompt import 
DIYPromptABC +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm from dataflow.core import LLMServingABC @@ -16,40 +16,32 @@ """ EVAL_TYPE = "key3_q_choices_as" -KEY_MAPS = { - "context": "context", # optional - "question": "question", - "choices": "choices", - "labels": "labels" -} - -class MMLUPromptDIY(DIYPromptABC): - def build_prompt(self, question: str = None, choices_text: str = None, context: str = None, **kwargs): - ctx = f"Context:\n{context}\n\n" if context else "" - return f"{ctx}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:" - class UnifiedBenchEvalPipeline(PipelineABC): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): super().__init__() self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type5.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type5.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="{context}Question:\n{question}\n\nChoices:\n{choices_text}\n\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=MMLUPromptDIY(), + prompt_template=self.generation_prompt_template, allow_overwrite=False, force_generate=False, ) @@ -66,15 +58,18 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_choices_key="choices", output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_choices_key="choices", + input_labels_key="labels", input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py index a5ff886c..041c6b14 100644 --- a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py @@ -1,6 +1,6 @@ from dataflow.pipeline.Pipeline import PipelineABC from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator -from dataflow.core.prompt import DIYPromptABC +from dataflow.prompts.core_text import FormatStrPrompt from dataflow.utils.storage import FileStorage from dataflow.serving import LocalModelLLMServing_vllm from dataflow.core import LLMServingABC @@ -16,40 +16,32 @@ """ EVAL_TYPE = "key3_q_a_rejected" -KEY_MAPS = { - "context": "context", # optional - "question": "question", - "better": "better", - "rejected": "rejected" -} - -class PreferencePairwisePromptDIY(DIYPromptABC): - def build_prompt(self, question: str = None, 
context: str = None, **kwargs): - ctx = f"Context:\n{context}\n\n" if context else "" - return f"{ctx}Question:\n{question}\n\nAnswer:" - class UnifiedBenchEvalPipeline(PipelineABC): def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): super().__init__() self.storage = FileStorage( - first_entry_file_name="/mnt/DataFlow/scy/DataFlow/dataflow/example/core_text_data/unified_bench_eval_type6.jsonl", + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type6.jsonl", cache_path="./cache_local", file_name_prefix="dataflow_cache_step", cache_type="jsonl", ) self.llm_serving_generator = LocalModelLLMServing_vllm( - hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path vllm_tensor_parallel_size=1, vllm_max_tokens=2048, ) + self.generation_prompt_template = FormatStrPrompt( + f_str_template="{context}Question:\n{question}\n\nAnswer:", + ) + self.answer_generator_step1 = BenchAnswerGenerator( llm_serving=self.llm_serving_generator, eval_type=EVAL_TYPE, - prompt_template=PreferencePairwisePromptDIY(), + prompt_template=self.generation_prompt_template, allow_overwrite=False, force_generate=False, ) @@ -66,15 +58,17 @@ def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judg def forward(self): self.answer_generator_step1.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", output_key="generated_ans", ) self.evaluator_step2.run( storage=self.storage.step(), - input_keys_map=KEY_MAPS, - input_context_key=None, + input_context_key="context", + input_question_key="question", + input_better_key="better", + input_rejected_key="rejected", input_pred_key="generated_ans", ) diff --git a/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type_semantic.py b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type_semantic.py new file mode 100644 index 00000000..fae6fb81 --- /dev/null +++ b/dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type_semantic.py @@ -0,0 +1,83 @@ +from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator +from dataflow.prompts.core_text import FormatStrPrompt +from dataflow.core.prompt import DIYPromptABC +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm, APILLMServing_request +from dataflow.core import LLMServingABC + +""" +all types: +"key1_text_score", +"key2_qa", +"key2_q_ma", +"key3_q_choices_a", +"key3_q_choices_as", +"key3_q_a_rejected", +""" + +EVAL_TYPE = "key2_qa" + +class UnifiedBenchEvalPipeline(): + def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None): + + self.storage = FileStorage( + first_entry_file_name="../example_data/core_text_data/unified_bench_eval_type2.jsonl", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="jsonl", + ) + + self.llm_serving_generator = LocalModelLLMServing_vllm( + hf_model_name_or_path="/mnt/DataFlow/models/Qwen2.5-7B-Instruct", # set to your own model path + vllm_tensor_parallel_size=1, + vllm_max_tokens=2048, + ) + + # use API server as LLM serving + self.llm_serving_judger = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o", + 
max_workers=5 + ) + + self.generation_prompt_template = FormatStrPrompt( + f_str_template="Question: {question}\nAnswer:", + ) + + self.answer_generator_step1 = BenchAnswerGenerator( + llm_serving=self.llm_serving_generator, + eval_type=EVAL_TYPE, + prompt_template=self.generation_prompt_template, + allow_overwrite=False, + force_generate=False, + ) + + self.evaluator_step2 = UnifiedBenchDatasetEvaluator( + eval_result_path="./cache_local/eval_result/eval_result.jsonl", + llm_serving=self.llm_serving_judger, + eval_type=EVAL_TYPE, + prompt_template=None, + use_semantic_judge=True, + metric_type=None, # use default metric + ) + + def forward(self): + self.answer_generator_step1.run( + storage=self.storage.step(), + input_question_key="question", + input_target_key="golden_label", + input_context_key=None, + output_key="generated_ans", + ) + + self.evaluator_step2.run( + storage=self.storage.step(), + input_question_key="question", + input_target_key="golden_label", + input_context_key=None, + input_pred_key="generated_ans", + ) + +if __name__ == "__main__": + pl = UnifiedBenchEvalPipeline() + pl.forward()
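
Usage sketch (reviewer note, not part of the patch): a minimal end-to-end key2_qa run wired with the per-column arguments introduced in PATCH 6/6 instead of KEY_MAPS. It only uses operators and parameters that appear in this series (BenchAnswerGenerator, UnifiedBenchDatasetEvaluator, FormatStrPrompt, FileStorage, LocalModelLLMServing_vllm); the dataset path, model name, and the question/golden_label column names are placeholders and would need to match your own data.

# Minimal sketch of the post-patch key2_qa flow; paths, model name and column
# names are illustrative placeholders, not values defined by the patch.
from dataflow.operators.core_text import BenchAnswerGenerator, UnifiedBenchDatasetEvaluator
from dataflow.prompts.core_text import FormatStrPrompt
from dataflow.utils.storage import FileStorage
from dataflow.serving import LocalModelLLMServing_vllm

storage = FileStorage(
    first_entry_file_name="./my_bench_key2_qa.jsonl",   # placeholder dataset
    cache_path="./cache_local",
    file_name_prefix="dataflow_cache_step",
    cache_type="jsonl",
)
serving = LocalModelLLMServing_vllm(
    hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct",   # or your own model path
    vllm_tensor_parallel_size=1,
    vllm_max_tokens=2048,
)

generator = BenchAnswerGenerator(
    llm_serving=serving,
    eval_type="key2_qa",
    prompt_template=FormatStrPrompt(f_str_template="Question: {question}\nAnswer:"),
)
evaluator = UnifiedBenchDatasetEvaluator(
    eval_result_path="./cache_local/eval_result/eval_result.jsonl",
    eval_type="key2_qa",
    metric_type=None,   # falls back to the default metric for key2_qa (math_verify)
)

# Column names are now passed explicitly per field instead of via a keys_map dict.
generator.run(
    storage=storage.step(),
    input_question_key="question",
    input_target_key="golden_label",
    output_key="generated_ans",
)
evaluator.run(
    storage=storage.step(),
    input_question_key="question",
    input_target_key="golden_label",
    input_pred_key="generated_ans",
)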