Skip to content

Commit 8051071

Browse files
committed
feat(paper_review): upgrade rebuttal prompts with discipline/venue/instructions support
Bring rebuttal prompts to the same professional academic level as review prompts:
- Deep integration with DisciplineConfig: evaluation_dimensions, correctness_categories, reviewer_context, and scoring_notes all flow into rebuttal generation and assessment
- Venue-aware: rebuttal generation adapts to venue conventions; assessment applies the venue-specific acceptance bar and contribution standards
- Author instructions support for rebuttal generation (same as the review prompt)
- Professional AC identity with high-standards calibration for assessment
- Structured assessment framework: Relevance, Evidence Strength, Completeness, Verifiability, Honesty — mirroring the rigor of the review evaluation dimensions
- Explicit score update rules: increase/decrease/maintain, each with concrete criteria
- Graders updated to pass venue and instructions parameters through to the prompts

Made-with: Cursor
1 parent af941b2 commit 8051071

File tree

6 files changed

+243
-57
lines changed

6 files changed

+243
-57
lines changed

cookbooks/paper_review/graders/rebuttal_assessment.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def build_rebuttal_assessment_messages(
5959
rebuttal_text: str,
6060
original_score: int,
6161
discipline: Optional[DisciplineConfig] = None,
62+
venue: Optional[str] = None,
6263
language: Optional[str] = None,
6364
) -> List[dict]:
6465
"""Build messages for rebuttal assessment."""
@@ -72,6 +73,7 @@ def build_rebuttal_assessment_messages(
7273
"role": "system",
7374
"content": get_rebuttal_assessment_system_prompt(
7475
discipline=discipline,
76+
venue=venue,
7577
language=language,
7678
),
7779
},
@@ -95,6 +97,7 @@ def __init__(
9597
self,
9698
model: BaseChatModel | dict,
9799
discipline: Optional[DisciplineConfig] = None,
100+
venue: Optional[str] = None,
98101
language: Optional[str] = None,
99102
):
100103
super().__init__(
@@ -105,6 +108,7 @@ def __init__(
105108
template="",
106109
)
107110
self.discipline = discipline
111+
self.venue = venue
108112
self.language = language
109113

110114
async def aevaluate(
@@ -132,6 +136,7 @@ async def aevaluate(
132136
rebuttal_text,
133137
original_score,
134138
discipline=self.discipline,
139+
venue=self.venue,
135140
language=self.language,
136141
)
137142
response = await self.model.achat(messages=messages)

cookbooks/paper_review/graders/rebuttal_generation.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def build_rebuttal_generation_messages(
5252
review_text: str,
5353
discipline: Optional[DisciplineConfig] = None,
5454
venue: Optional[str] = None,
55+
instructions: Optional[str] = None,
5556
language: Optional[str] = None,
5657
) -> List[dict]:
5758
"""Build messages for rebuttal generation."""
@@ -62,6 +63,7 @@ def build_rebuttal_generation_messages(
6263
"content": get_rebuttal_generation_system_prompt(
6364
discipline=discipline,
6465
venue=venue,
66+
instructions=instructions,
6567
language=language,
6668
),
6769
},
@@ -87,6 +89,7 @@ def __init__(
8789
model: BaseChatModel | dict,
8890
discipline: Optional[DisciplineConfig] = None,
8991
venue: Optional[str] = None,
92+
instructions: Optional[str] = None,
9093
language: Optional[str] = None,
9194
):
9295
super().__init__(
@@ -98,6 +101,7 @@ def __init__(
98101
)
99102
self.discipline = discipline
100103
self.venue = venue
104+
self.instructions = instructions
101105
self.language = language
102106

103107
async def aevaluate(self, pdf_data: str, review_text: str) -> GraderScore:
@@ -116,6 +120,7 @@ async def aevaluate(self, pdf_data: str, review_text: str) -> GraderScore:
116120
review_text,
117121
discipline=self.discipline,
118122
venue=self.venue,
123+
instructions=self.instructions,
119124
language=self.language,
120125
)
121126
response = await self.model.achat(messages=messages)

cookbooks/paper_review/pipeline.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,11 +166,13 @@ def _init_graders(self):
166166
self.model,
167167
discipline=self._discipline,
168168
venue=self._venue,
169+
instructions=self._instructions,
169170
language=self._language,
170171
)
171172
self.rebuttal_assessment_grader = RebuttalAssessmentGrader(
172173
self.model,
173174
discipline=self._discipline,
175+
venue=self._venue,
174176
language=self._language,
175177
)
176178

cookbooks/paper_review/prompts/rebuttal_assessment.py

Lines changed: 117 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,74 +10,160 @@
1010
def get_rebuttal_assessment_system_prompt(
1111
date: datetime | None = None,
1212
discipline: Optional[DisciplineConfig] = None,
13+
venue: Optional[str] = None,
1314
language: Optional[str] = None,
1415
) -> str:
1516
"""Get system prompt for rebuttal assessment.
1617
1718
Args:
1819
date: Date to use (defaults to today).
1920
discipline: Discipline-specific configuration.
21+
venue: Target conference/journal name.
2022
language: Output language ("en" default, "zh" for Chinese).
2123
"""
2224
current_date = (date or datetime.now()).strftime("%Y-%m-%d")
2325

26+
# ── AC / Meta-Reviewer identity ───────────────────────────────────────────
2427
if discipline:
25-
identity = (
26-
f"You are a senior Area Chair / Meta-Reviewer in {discipline.name}, "
27-
f"responsible for evaluating whether the authors' rebuttal adequately "
28-
f"addresses the reviewers' concerns."
28+
discipline_label = discipline.name
29+
reviewer_context = discipline.reviewer_context or f"You specialize in {discipline_label}."
30+
identity_block = (
31+
f"You are a senior Area Chair / Meta-Reviewer for a top venue in {discipline_label}. "
32+
f"You are the most experienced and fair-minded AC in the field.\n"
33+
f"{reviewer_context}\n"
34+
f"You have served on dozens of program committees and have deep understanding of "
35+
f"what constitutes a convincing rebuttal versus a superficial or evasive one."
2936
)
3037
else:
31-
identity = (
32-
"You are a senior Area Chair / Meta-Reviewer responsible for evaluating "
33-
"whether the authors' rebuttal adequately addresses the reviewers' concerns."
38+
identity_block = (
39+
"You are a senior Area Chair / Meta-Reviewer for a top academic venue. "
40+
"You are the most experienced and fair-minded AC in the field.\n"
41+
"You have served on dozens of program committees and have deep understanding of "
42+
"what constitutes a convincing rebuttal versus a superficial or evasive one."
3443
)
3544

45+
# ── Venue context ─────────────────────────────────────────────────────────
46+
if venue:
47+
venue_block = (
48+
f"\n**Target Venue: {venue}**\n"
49+
f"You are making a recommendation for **{venue}**. Apply {venue}'s specific "
50+
f"standards and contribution bar when judging whether the rebuttal resolves "
51+
f"the concerns sufficiently for this venue. Consider {venue}'s acceptance rate, "
52+
f"audience expectations, and the level of rigor required."
53+
)
54+
elif discipline and discipline.venues:
55+
venue_list = discipline.format_venues()
56+
venue_block = (
57+
f"\nYou typically serve as AC for top venues in {discipline.name}, "
58+
f"such as: {venue_list}. Apply corresponding standards."
59+
)
60+
else:
61+
venue_block = (
62+
"\nYou typically serve as AC for top venues such as " "NeurIPS, ICLR, ICML, Nature, Science, The Lancet."
63+
)
64+
65+
# ── Evaluation dimensions for assessment ──────────────────────────────────
66+
if discipline and discipline.evaluation_dimensions:
67+
dimensions_block = (
68+
"\nWhen evaluating whether reviewer concerns are adequately addressed, consider "
69+
"how the rebuttal impacts these key evaluation dimensions that reviewers in this "
70+
"field prioritize:\n\n" + discipline.format_evaluation_dimensions()
71+
)
72+
else:
73+
dimensions_block = (
74+
"\nWhen evaluating whether concerns are adequately addressed, consider how the "
75+
"rebuttal impacts these dimensions: Quality, Clarity, Significance, Originality, "
76+
"Reproducibility, Ethics & Limitations, and Citations & Related Work."
77+
)
78+
79+
# ── Discipline-specific error awareness ───────────────────────────────────
80+
if discipline and discipline.correctness_categories:
81+
error_awareness_block = (
82+
"\nReviewers in this field commonly flag these types of errors. When assessing "
83+
"whether the rebuttal resolves such concerns, apply the appropriate level of "
84+
"scrutiny for each category:\n\n" + discipline.format_correctness_categories()
85+
)
86+
else:
87+
error_awareness_block = ""
88+
89+
# ── Scoring ───────────────────────────────────────────────────────────────
90+
scoring_block = """Scoring (1-6):
91+
1: Strong Reject - Well-known results, technical flaws, or unaddressed ethical considerations
92+
2: Reject - Technical flaws, weak evaluation, inadequate reproducibility
93+
3: Borderline Reject - Technically solid but reasons to reject outweigh reasons to accept
94+
4: Borderline Accept - Technically solid where reasons to accept outweigh reasons to reject
95+
5: Accept - Technically solid with high impact, good evaluation
96+
6: Strong Accept - Technically flawless with groundbreaking impact"""
97+
98+
if discipline and discipline.scoring_notes:
99+
scoring_block += f"\n\nDiscipline-specific guidance: {discipline.scoring_notes}"
100+
101+
# ── Output language ───────────────────────────────────────────────────────
36102
if language == "zh":
37103
language_block = (
38104
"\n**Output Language: Chinese (Simplified)**\n"
39-
"You MUST write the entire assessment in Simplified Chinese (简体中文)."
105+
"You MUST write the entire assessment in Simplified Chinese (简体中文). "
106+
"Technical terms may remain in English where conventional."
40107
)
41108
else:
42109
language_block = ""
43110

44-
return f"""{identity}
111+
return f"""{identity_block}
45112
46113
**Current Date: {current_date}**
47-
{language_block}
114+
Note: References to papers from 2024, 2025, or 2026 are valid and should NOT be flagged as "future" papers.
115+
{venue_block}{language_block}
116+
117+
You keep incredibly high standards. A convincing rebuttal must meet ALL of the following criteria:
118+
- Addresses the **specific** concern raised, not a straw-man or adjacent issue
119+
- Provides **concrete evidence** (experimental results, theoretical arguments, citations) rather than vague promises
120+
- Maintains **intellectual honesty** — acknowledging genuine limitations rather than deflecting
121+
- Demonstrates **scholarly professionalism** in tone and substance
48122
49123
You are given:
50124
1. The original paper (PDF)
51125
2. The reviewer comments
52126
3. The authors' rebuttal
53127
4. The original recommendation score (1-6)
54-
55-
Your task:
56-
1. For each reviewer concern, determine whether the rebuttal addresses it:
57-
- "fully_addressed": The response is convincing with sufficient evidence or clarification.
58-
- "partially_addressed": The response acknowledges the issue but the resolution is incomplete.
59-
- "not_addressed": The concern is ignored or the response is inadequate.
60-
2. Evaluate the overall quality of the rebuttal (professionalism, evidence, honesty).
61-
3. Based on the rebuttal, decide an updated recommendation score (1-6). The score may go up, stay the same, or go down.
62-
63-
Scoring reminder (1-6):
64-
1: Strong Reject 2: Reject 3: Borderline Reject 4: Borderline Accept 5: Accept 6: Strong Accept
128+
{dimensions_block}
129+
{error_awareness_block}
130+
131+
ASSESSMENT FRAMEWORK — For each reviewer concern, evaluate:
132+
1. **Relevance**: Does the response address the actual concern or a different/adjacent issue?
133+
2. **Evidence strength**: Is the response backed by concrete evidence (data, proofs, citations) or just assertions?
134+
3. **Completeness**: Does the response fully resolve the concern, or are aspects left unaddressed?
135+
4. **Verifiability**: Can the claims in the rebuttal be verified against the paper content?
136+
5. **Honesty**: Does the author honestly acknowledge limitations, or are they deflecting/dismissing valid criticism?
137+
138+
ADEQUACY CLASSIFICATION:
139+
- "fully_addressed": The response directly addresses the concern with convincing evidence or clarification that can be verified against the paper. The concern no longer stands as a reason to reject.
140+
- "partially_addressed": The response acknowledges the concern and provides some evidence, but the resolution is incomplete — e.g., promised experiments not yet shown, partial clarification that leaves open questions, or evidence that only addresses part of the concern.
141+
- "not_addressed": The concern is ignored, the response is off-topic, the argument is circular, or the evidence provided does not actually resolve the issue.
142+
143+
SCORE UPDATE RULES:
144+
- The score may increase if major concerns are convincingly resolved with strong evidence.
145+
- The score stays the same if the rebuttal is adequate but does not change the fundamental assessment.
146+
- The score may DECREASE if the rebuttal reveals new weaknesses (contradictions, misunderstandings of own work, dishonest framing).
147+
- A single unresolved major concern is sufficient reason to maintain or lower the score.
148+
- Promises of future work ("we will add...") without concrete evidence carry minimal weight.
149+
150+
{scoring_block}
65151
66152
Return your assessment as JSON:
67153
{{
68154
"updated_score": <int 1-6>,
69-
"score_change_reasoning": "Why the score changed (or didn't)",
70-
"overall_assessment": "High-level summary of the rebuttal quality",
155+
"score_change_reasoning": "Why the score changed (or didn't), referencing specific concerns",
156+
"overall_assessment": "High-level summary of the rebuttal quality and its impact on the paper's standing",
71157
"point_assessments": [
72158
{{
73-
"concern": "The reviewer's original concern",
74-
"author_response_summary": "Brief summary of author's response",
159+
"concern": "The reviewer's original concern (verbatim or faithfully summarized)",
160+
"author_response_summary": "Brief summary of the author's response",
75161
"adequacy": "fully_addressed" or "partially_addressed" or "not_addressed",
76-
"reasoning": "Why you judged it this way"
162+
"reasoning": "Detailed reasoning for your judgment, referencing evidence from the paper and rebuttal"
77163
}}
78164
],
79-
"unresolved_concerns": ["List of concerns that remain unresolved"],
80-
"rebuttal_strengths": ["What the rebuttal did well"]
165+
"unresolved_concerns": ["Specific concerns that remain unresolved after the rebuttal"],
166+
"rebuttal_strengths": ["What the rebuttal did particularly well"]
81167
}}"""
82168

83169

@@ -89,4 +175,6 @@ def get_rebuttal_assessment_system_prompt(
89175
Author rebuttal:
90176
{rebuttal_text}
91177
92-
Please assess whether the rebuttal adequately addresses each concern and provide an updated recommendation score."""
178+
Carefully evaluate each point in the rebuttal against the original paper and reviewer comments. \
179+
Determine whether each concern is fully addressed, partially addressed, or not addressed. \
180+
Then provide an updated recommendation score with detailed justification."""

0 commit comments

Comments (0)