feat(presentation_quality): upgrade grading to 1/3/5 scoring system with markdown cleanup

TaoShuchang · TaoShuchang · commit d9cbdc0cd650 · 2026-01-27T19:09:27.000+08:00
- Add function to strip markdown code block fences in grounding and presentation_quality modules
- Change presentation quality grader to score each of 8 criteria on a 1/3/5 scale instead of pass/fail
- Normalize total score by dividing sum of item scores by max (40), improving granularity
- Update reasoning output to list lowest scoring items with notes for focused feedback
- Revise presentation quality prompt to reflect new 1/3/5 scoring rubric with detailed instructions
- Adjust JSON output schema accordingly, replacing boolean pass with numeric score fields
- Add get_score utility in JSON utils to extract and validate scores from graded items
- Clean report input by removing markdown fences before grading to avoid markup noise
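- Add grounding weight configuration in YAML template for improved modular judge weighting
- Switch user prompt placeholders from `{{...}}` to `{...}` and make the YAML project_name a `{{PREFIX}}` placeholder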
diff --git a/tutorial/example_deep_finance/judge/grounding/json_utils.py b/tutorial/example_deep_finance/judge/grounding/json_utils.py
@@ -145,6 +145,20 @@ def _strip_think(text: str) -> str:
     return re.sub(r"<think>.*?</think>\s*", "", text, flags=re.S).strip()
 
 
+def _strip_markdown_fences(text: str) -> str:
+    """
+    Unwrap a report enclosed in one outer markdown code fence.
+    - Opening fence: ``` / ```markdown / ```md alone on the first line.
+    - The trailing ``` is dropped only when an opening fence was removed.
+    """
+    text = text.strip()
+    # A language-tagged fence such as ```python is report content: keep it.
+    text, opened = re.subn(r'\A```(?:markdown|md)?[ \t\r]*(?:\n|\Z)', '', text, flags=re.IGNORECASE)
+    if opened:
+        text = re.sub(r'\n?```\s*\Z', '', text)
+    return text.strip()
+
+
 def _normalize_traj(trajectory):
     """兼容 [[...]] 格式"""
     if isinstance(trajectory, list) and trajectory and isinstance(trajectory[0], list):
@@ -216,6 +230,9 @@ def construct_reward_prompt(trajectory: List[Dict[str, Any]], user_prompt_templa
                 final_report = _strip_think(_extract_text_content(traj[i].get("content")))
                 break
 
+    # 清理 markdown 代码块标记
+    final_report = _strip_markdown_fences(final_report)
+
     # 遍历提取 user_query, tool_calls, evidence
     for idx, step in enumerate(traj):
         role = step.get("role")
diff --git a/tutorial/example_deep_finance/judge/presentation_quality/grader.py b/tutorial/example_deep_finance/judge/presentation_quality/grader.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import re
 from typing import Any, Dict, List, Tuple
 
 from openjudge.graders.base_grader import BaseGrader
@@ -20,14 +21,14 @@
     B_KEYS,
     C_KEYS,
 )
-from .json_utils import strict_load_json, validate_shape, get_bool_pass, get_note
+from .json_utils import strict_load_json, validate_shape, get_score, get_note
 
 
 class PresentationQualityGrader(BaseGrader):
     """
     - 输入：report_content（研究报告文本）
     - 输出：GraderScore(name, score, reason)
-    - score：8项(pass)均分，范围[0,1]
+    - score：8项按1/3/5分制评分，总分归一化到[0,1]（总分/40）
     - determinism：建议用 temperature=0 + disable thinking 等（见 create_default_model）
     - 解析失败：score=0，并在 reason 显示报错
     """
@@ -92,7 +93,13 @@ async def aevaluate(
         入口：直接喂 report_content（研究报告文本）
         - user_query 可选：用于填充 prompt；不提供则用 "(unknown)"
         """
+
+        
         report = (report_content or "").strip()
+        
+        # 清理 markdown 代码块标记
+        report = self._strip_markdown_fences(report)
+        
         if not report:
             return GraderScore(
                 name=self.name,
@@ -143,6 +150,7 @@ async def aevaluate(
             )
 
         score, reason = self._score_and_reason(obj)
+        
         return GraderScore(name=self.name, score=score, reason=reason)
 
     def _score_and_reason(self, obj: Dict[str, Any]) -> Tuple[float, str]:
@@ -151,13 +159,13 @@ def _score_and_reason(self, obj: Dict[str, Any]) -> Tuple[float, str]:
         editorial = obj["editorial"]
         top_fixes = obj.get("top_fixes", [])
 
-        # 8项均分（强确定性：完全由Python算）
-        pass_map: Dict[str, bool] = {}
+        # 8项按1/3/5分制计分（强确定性：完全由Python算）
+        score_map: Dict[str, int] = {}
         note_map: Dict[str, str] = {}
 
         def take(section: Dict[str, Any], key: str):
             item = section.get(key)
-            pass_map[key] = get_bool_pass(item)
+            score_map[key] = get_score(item)
             note_map[key] = get_note(item)
 
         for k in A_KEYS:
@@ -167,21 +175,37 @@ def take(section: Dict[str, Any], key: str):
         for k in C_KEYS:
             take(editorial, k)
 
-        passed = sum(1 for k in ALL_KEYS if pass_map.get(k) is True)
-        total = len(ALL_KEYS)  # 8
-        score = passed / float(total)
+        # 总分 = 各项得分之和 / 最高可能分 (8*5=40)，归一化到[0,1]
+        total_score = sum(score_map.get(k, 1) for k in ALL_KEYS)
+        max_score = len(ALL_KEYS) * 5  # 8 * 5 = 40
+        score = total_score / float(max_score)
 
-        # reason：不加额外字段，只给紧凑总结
-        failed_items = [k for k in ALL_KEYS if not pass_map.get(k, False)]
-        failed_str = ", ".join(f"{k}({note_map.get(k,'')})" for k in failed_items[:4])
+        # reason：按分数排序，列出低分项
+        low_items = [(k, score_map.get(k, 1)) for k in ALL_KEYS if score_map.get(k, 1) < 5]
+        low_items.sort(key=lambda x: x[1])  # 从低到高
+        low_str = ", ".join(f"{k}={s}({note_map.get(k,'')})" for k, s in low_items[:4])
         fixes_str = " | ".join(str(x) for x in (top_fixes or [])[:3])
 
         parts: List[str] = []
-        parts.append(f"Pass {passed}/{total}")
-        if failed_items:
-            parts.append(f"Fail: {failed_str}")
+        parts.append(f"Score {total_score}/{max_score}")
+        if low_items:
+            parts.append(f"Low: {low_str}")
         if fixes_str:
             parts.append(f"TopFixes: {fixes_str}")
 
         reason = " ; ".join(parts)
         return round(score, 6), reason[:800]
+
+    @staticmethod
+    def _strip_markdown_fences(text: str) -> str:
+        """
+        Unwrap a report enclosed in one outer markdown code fence.
+        - Opening fence: ``` / ```markdown / ```md alone on the first line.
+        - The trailing ``` is dropped only when an opening fence was removed.
+        """
+        text = text.strip()
+        # A language-tagged fence such as ```python is report content: keep it.
+        text, opened = re.subn(r'\A```(?:markdown|md)?[ \t\r]*(?:\n|\Z)', '', text, flags=re.IGNORECASE)
+        if opened:
+            text = re.sub(r'\n?```\s*\Z', '', text)
+        return text.strip()
diff --git a/tutorial/example_deep_finance/judge/presentation_quality/json_utils.py b/tutorial/example_deep_finance/judge/presentation_quality/json_utils.py
@@ -51,6 +51,28 @@ def get_bool_pass(item: Any) -> bool:
     return False
 
 
+def get_score(item: Any) -> int:
+    """
+    Extract the 1/3/5 rubric score from a graded item.
+    Accepts a dict with a "score" field or a bare value; numeric strings
+    such as "5" are parsed. Returns 1 if the value is missing or invalid.
+    """
+    v = item.get("score") if isinstance(item, dict) else item
+    if isinstance(v, str):
+        try:
+            v = float(v)
+        except ValueError:
+            return 1
+    if not isinstance(v, (int, float)) or v != v:  # NaN compares unequal to itself
+        return 1
+    if v < 2:  # fractional values truncate toward zero, so 1.x still counts as 1
+        return 1
+    if v >= 5:
+        return 5
+    # 2, 3 and 4 (after truncation) all map to the middle grade
+    return 3
+
+
 def get_note(item: Any) -> str:
     if isinstance(item, dict):
         note = item.get("note", "")
diff --git a/tutorial/example_deep_finance/judge/presentation_quality/prompt.py b/tutorial/example_deep_finance/judge/presentation_quality/prompt.py
@@ -1,86 +1,100 @@
 # 8项呈现质量检查：A(3)+B(3)+C(2)=8
 QUALITY_SYSTEM_PROMPT = """
-你是一位“呈现质量评审官”。你只评估报告的**呈现与表达质量 (Presentation & Editorial Quality)**，用于奖励信号。
-严禁评估：事实真伪/引用支持（Grounding 负责）、内容覆盖广度（Breadth 负责）、分析深度与洞察（Depth 负责）、观点是否正确。
-核心关注：**可扫描性**、**信息结构化**、**逻辑链条的可视化呈现**、**表达清晰与可用性**。
+你是一位“深度研究报告呈现评审官”。你的任务是评估报告的 **用户体验与信息架构 (Presentation & UX)**，为强化学习提供奖励信号。
+
+**严禁评估**：事实真伪、引用准确性（由 Grounding 模型负责）、内容广度与深度。
+**核心关注**：**认知负荷管理**、**信息的可扫读性**、**逻辑的可视化**、**Markdown 渲染质量**。
 
 ========================
-评分标准（仅判定 pass=true/false）
+评分标准 (1/3/5 分制)
 ========================
-对以下 8 个检查项分别给出 pass/fail，并给一句 note（≤25字，需指出“位置或症状”，避免空泛）。
-
-A) Scan & Navigation（可扫描性）
-A1 结论先行（Key Takeaways Top）
-- Pass：开头可见“摘要/要点/核心结论”块（短段或列表均可），读者无需通读即可抓到主结论。
-- Fail：开头直接进入细节/材料堆叠，无概括性要点。
-
-A2 结构导航（Navigable Structure）
-- Pass：正文有清晰分节（标题层级或明显分段），读者能快速定位主要部分（分析/风险/结论等）。
-- Fail：无结构或结构混乱，像长篇流水账，难以导航。
-
-A3 视觉重点（Visual Hierarchy）
-- Pass：重点信息对“扫读友好”（要点化/短句分行/适度强调等），且重点承载信息而非装饰。
-- Fail：全文平铺直叙；或存在明显“格式堆砌”但不增信息。
-
-B) Information Structuring（信息结构化）
-B1 密集信息解构（Dense Info Structured）
-- Pass：数字/多条件/多点信息密集处被列表/分组/表格等拆解，易读易取。
-- Fail：关键数据淹没在长难句或长段落（典型：数字长句串联）。
-
-B2 对比对齐（Comparisons Aligned）
-- Pass：涉及横向对比（A vs B/同行对比/情景对比）时，用表格或对齐结构呈现，使维度一眼可比（不强制表格）。
-- Fail：对比点散落在不同段落，维度不对齐，无法直观对照。
-
-B3 一致性（Consistency）
-- Pass：单位/口径/标点/小标题/列表风格整体统一，专业感稳定。
-- Fail：格式与表述明显混乱，增加阅读负担。
-
-C) Editorial Clarity（编辑清晰度）
-C1 论证链可视化（Argument Chain Presented）
-- Pass：在呈现上能跟随“主张→依据→解释→影响/结论”的链条（例如用分段或 bullet 串联/对齐呈现），不是只堆材料。
-- Fail：大量材料堆砌，但缺少可视化的逻辑线索（读者难跟随）。
-
-C2 风险与行动（Risk & Actionability Clear）
-- Pass：以清晰形式列出风险/边界/不确定性，并给出可执行的下一步关注点（只看表达是否清楚存在，不评全面与正确）。
-- Fail：未提及风险/边界/下一步，或表述极度含糊不可操作。
-
-反刷分原则（必须执行）：
-- 空标题占位、空表格/无意义表格、重复 bullet 但不增加信息 → 相关项直接判 fail，并在 note 标注“形式堆砌”。
+对以下 8 个维度进行打分。
+- **1分 (Fail)**：严重阻碍阅读，格式混乱或缺失。
+- **3分 (Pass)**：勉强及格，有基本结构，但平庸、啰嗦或不够直观。
+- **5分 (Excellent)**：出版级质量，结构极佳，一眼能抓取核心，降低了读者的认知成本。
+
+请针对每个子项给出分数（1, 3, 5）及 Note（≤25字，指出具体位置或症状）。
+
+### A) Scan & Navigation（可扫描性）
+**A1 结论先行 (Key Takeaways Top)**
+- 5分：开头有独立的“核心摘要/TL;DR”块，且要点清晰，读者无需滚动即可获取主结论。
+- 3分：有摘要，但写成了流水账段落，或混杂在正文中不够醒目。
+- 1分：无摘要，开篇即陷入细节或背景介绍。
+
+**A2 结构导航 (Navigable Structure)**
+- 5分：层级分明 (H1/H2/H3)，长文有清晰的“路标”（小标题），支持快速跳读定位。
+- 3分：有分节，但段落过长（Wall of text），缺乏内部视觉引导。
+- 1分：结构混乱，标题层级错误或缺失，难以导航。
+
+**A3 视觉重点 (Visual Hierarchy)**
+- 5分：利用 **加粗**、*斜体* 或 `代码块` 精准强调核心洞察，信噪比高。
+- 3分：有强调，但过度使用（满篇加粗）或重点不突出（强调了无关词）。
+- 1分：全文平铺直叙，无任何视觉重点。
+
+### B) Information Structuring（信息结构化）
+**B1 密集信息解构 (Dense Info Structured)**
+- 5分：复杂数据/多条件逻辑被转化为 Markdown **表格** 或 **嵌套列表**，一目了然。
+- 3分：使用了列表，但内容仍是长难句堆砌，未真正拆解信息。
+- 1分：关键数字或复杂参数淹没在长段落文本中。
+
+**B2 对比对齐 (Comparisons Aligned)**
+- 5分：涉及对比（方案A vs B / 历史 vs 现状）时，使用表格或对齐结构，维度横向可比。
+- 3分：有对比意图，但分散在不同段落，读者需来回对照。
+- 1分：对比维度混乱或缺失，无法直观比较。
+
+**B3 一致性与渲染 (Consistency & Rendering)**
+- 5分：格式统一（符号/单位），Markdown 渲染完美（表格无断裂、公式无乱码）。
+- 3分：存在少量格式不统一，或轻微的渲染瑕疵但不影响理解。
+- 1分：表格错位、公式未闭合、列表层级混乱，严重影响阅读。
+
+### C) Editorial Clarity（编辑清晰度）
+**C1 论证链可视化 (Argument Chain Presented)**
+- 5分：逻辑链条可视（如使用 `主张 -> 证据 -> 结论` 的结构），引用锚点清晰 `[1]`。
+- 3分：逻辑存在，但淹没在文字中，缺乏连接词或视觉引导。
+- 1分：材料堆砌，缺乏清晰的推导线索。
+
+**C2 风险与行动 (Risk & Actionability Clear)**
+- 5分：独立板块清晰列出“风险/局限性”及“下一步建议”，具有极高的可操作性。
+- 3分：提到了风险或建议，但含糊其辞，或混杂在结论中。
+- 1分：完全未提及风险边界或下一步行动。
+
+**反刷分原则 (Anti-Gaming)**：
+- 空表格、无意义的重复列表、为了格式而格式（如把一句简单的话硬拆成列表） -> 直接判 **1分**，Note 标注“过度格式化”。
 
 ========================
-输出要求（Strict JSON）
+输出要求 (Strict JSON)
 ========================
-必须输出可解析 JSON；pass 必须为 boolean。
-不要输出 Markdown；不要添加额外字段；不得省略字段。
+必须输出可解析 JSON。
+**注意**：为了提供梯度信号，每项使用 `score` 字段（不再使用 `pass`），值必须为 1、3 或 5；模板中的 0 仅为占位符，不是合法分值。
 
-JSON 模板（字段必须齐全）：
+JSON 模板：
 {
   "scan": {
-    "A1_key_takeaways_top": {"pass": true, "note": "≤25字定位理由"},
-    "A2_navigable_structure": {"pass": true, "note": "≤25字定位理由"},
-    "A3_visual_hierarchy": {"pass": true, "note": "≤25字定位理由"}
+    "A1_key_takeaways_top": {"score": 0, "note": "≤25字定位理由"},
+    "A2_navigable_structure": {"score": 0, "note": "≤25字定位理由"},
+    "A3_visual_hierarchy": {"score": 0, "note": "≤25字定位理由"}
   },
   "structuring": {
-    "B1_dense_info_structured": {"pass": false, "note": "≤25字定位理由"},
-    "B2_comparisons_aligned": {"pass": true, "note": "≤25字定位理由"},
-    "B3_consistency": {"pass": true, "note": "≤25字定位理由"}
+    "B1_dense_info_structured": {"score": 0, "note": "≤25字定位理由"},
+    "B2_comparisons_aligned": {"score": 0, "note": "≤25字定位理由"},
+    "B3_consistency": {"score": 0, "note": "≤25字定位理由"}
   },
   "editorial": {
-    "C1_argument_chain_presented": {"pass": false, "note": "≤25字定位理由"},
-    "C2_risk_and_actionability_clear": {"pass": true, "note": "≤25字定位理由"}
+    "C1_argument_chain_presented": {"score": 0, "note": "≤25字定位理由"},
+    "C2_risk_and_actionability_clear": {"score": 0, "note": "≤25字定位理由"}
   },
-  "top_fixes": ["最多3条，仅谈呈现层面改进"]
+  "top_fixes": ["最多3条，仅谈呈现层面改进，针对最低分项"]
 }
 """
 
 USER_PROMPT_TEMPLATE = """
 请审计以下研究报告的【呈现质量】（只谈呈现/排版/结构，不谈事实对错/引用支持/覆盖/深度）。
 
 ### User Query
-{{user_query}}
+{user_query}
 
 ### AI Report
-{{report_content}}
+{report_content}
 
 -----
 请严格按 System Prompt 的锚点输出 JSON；不要输出 Markdown；不要添加额外字段。
diff --git a/tutorial/example_deep_finance/yaml_template/deep_finance_template.yaml b/tutorial/example_deep_finance/yaml_template/deep_finance_template.yaml
@@ -1,6 +1,6 @@
 # ------------------ 主要配置 ------------------
 ajet:
-  project_name: ajet_deep_finance
+  project_name: "{{PREFIX}}"
   experiment_name: "{{SUFFIX}}"
   # Judge 配置（嵌套结构，对应 self.config.ajet.judge.*）
   judge:
@@ -11,6 +11,7 @@ ajet:
     val_ref_ans_path: {{VAL_REF_ANS_PATH}}       # 验证集 Reference Answer 路径
   # OpenJudge 权重配置
   presentation_quality_weight: {{PRESENTATION_QUALITY_WEIGHT}}   # 报告呈现质量评估
+  grounding_weight: {{GROUNDING_WEIGHT}}                         # 引用规范性评估
   rm_weight: {{RM_WEIGHT}}                                       # RM Gallery 权重
   task_judge:
     # 使用本地 DeepFinanceJudge 进行评估（解耦远程 env_service）