Merge pull request #11 from MiroMindAI/test/yfd

BinWang28 · web-flow · commit b68b7265786e · 2026-01-28T15:04:14.000+08:00
feat(agent): add rollback retry and exceed max turn summary
diff --git a/config/benchmark/default.yaml b/config/benchmark/default.yaml
@@ -8,5 +8,7 @@ data:
 
 execution:
   max_tasks: null  # null means no limit
-  max_concurrent: 5 
-  pass_at_k: 1
+  max_concurrent: 5
+  pass_at_k: 1
+  max_retry: 1
+  exceed_max_turn_summary: false
diff --git a/config/benchmark/gaia-validation-text-only.yaml b/config/benchmark/gaia-validation-text-only.yaml
@@ -11,9 +11,9 @@ data:
 execution:
   max_tasks: null
   max_concurrent: 3
-  pass_at_k: 3
-  stop_condition: "valid_box"  # Options: "correct", "valid_box", "max_turn"
-  enable_failure_experience: true
+  pass_at_k: 1
+  max_retry: 5
+  exceed_max_turn_summary: true
 
 openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
 openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
diff --git a/config/binwang_fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback_new_tools_toolblacklist.yaml b/config/binwang_fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback_new_tools_toolblacklist.yaml
@@ -26,14 +26,14 @@ main_agent:
   output_processor:
     - ${output-summary}
     - ${output-final-answer-extraction}
-    - ${output-failure-experience}
+    - ${output-exceed-max-turn-summary}
 
 input-message-generator:
   type: InputMessageGenerator
 output-summary:
   type: SummaryGenerator
-output-failure-experience:
-  type: FailureExperienceSummaryGenerator
+output-exceed-max-turn-summary:
+  type: ExceedMaxTurnSummaryGenerator
   prompt: config/prompts/fangda_prompt_main_agent.yaml
   llm:
     _base_: config/llm/base_mirothinker.yaml
diff --git a/config/fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback.yaml b/config/fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback.yaml
@@ -21,7 +21,7 @@ main_agent:
   output_processor:
     - ${output-summary}
     - ${output-final-answer-extraction}
-    - ${output-failure-experience}
+    - ${output-exceed-max-turn-summary}
 
 input-message-generator:
   type: InputMessageGenerator
@@ -32,8 +32,8 @@ output-final-answer-extraction:
   prompt: config/prompts/fangda_prompt_main_agent.yaml
   llm:
     _base_: config/llm/base_mirothinker.yaml
-output-failure-experience:
-  type: FailureExperienceSummaryGenerator
+output-exceed-max-turn-summary:
+  type: ExceedMaxTurnSummaryGenerator
   prompt: config/prompts/fangda_prompt_main_agent.yaml
   llm:
     _base_: config/llm/base_mirothinker.yaml
diff --git a/config/fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback_new_tools.yaml b/config/fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback_new_tools.yaml
@@ -21,14 +21,14 @@ main_agent:
   output_processor:
     - ${output-summary}
     - ${output-final-answer-extraction}
-    - ${output-failure-experience}
+    - ${output-exceed-max-turn-summary}
 
 input-message-generator:
   type: InputMessageGenerator
 output-summary:
   type: SummaryGenerator
-output-failure-experience:
-  type: FailureExperienceSummaryGenerator
+output-exceed-max-turn-summary:
+  type: ExceedMaxTurnSummaryGenerator
   prompt: config/prompts/fangda_prompt_main_agent.yaml
   llm:
     _base_: config/llm/base_mirothinker.yaml
diff --git a/config/prompts/fangda_prompt_main_agent.yaml b/config/prompts/fangda_prompt_main_agent.yaml
@@ -303,9 +303,9 @@ template:
 
       Extract the final answer in \boxed{} format:
 
-  failure_summary_prompt:
+  exceed_max_turn_summary_prompt:
     components:
-      - basic_failure_summary_prompt
+      - basic_exceed_max_turn_summary_prompt
 
     required_context:
       - task_description
@@ -315,7 +315,7 @@ template:
       - final_boxed_answer
       - error_message
 
-    basic_failure_summary_prompt: |
+    basic_exceed_max_turn_summary_prompt: |
       The task was not completed successfully. Do NOT call any tools. Provide a structured summary:
 
       Failure type: [incomplete / blocked / misdirected / format_missed]
@@ -346,27 +346,27 @@ template:
       Error Encountered: {{ error_message }}
       {% endif %}
 
-  failure_experience_header:
+  exceed_max_turn_summary_header:
     components:
       - header_text
 
     header_text: |
       === Previous Attempts Analysis ===
       The following summarizes what was tried before and why it did not work. Use this to guide a NEW approach.
 
-  failure_experience_item:
+  exceed_max_turn_summary_item:
     components:
       - item_text
 
     required_context:
       - attempt_number
-      - failure_summary
+      - summary
 
     item_text: |
       [Attempt {{ attempt_number }}]
-      {{ failure_summary }}
+      {{ summary }}
 
-  failure_experience_footer:
+  exceed_max_turn_summary_footer:
     components:
       - footer_text
 
diff --git a/scripts/fangda_run_evaluate_multiple_runs_mirothinker_gaia-validation-text-only.sh b/scripts/fangda_run_evaluate_multiple_runs_mirothinker_gaia-validation-text-only.sh
@@ -7,7 +7,7 @@
 # Configuration parameters
 NUM_RUNS=3
 BENCHMARK_NAME="gaia-validation-text-only"
-AGENT_SET="fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback"
+AGENT_SET="fangda_agent_gaia-validation-text-only_mirothinker_single_agent_rollback_new_tools"
 MAX_CONCURRENT=30
 
 # Set results directory with timestamp
diff --git a/src/agents/iterative_agent.py b/src/agents/iterative_agent.py
@@ -152,7 +152,7 @@ async def run_internal(self, ctx: AgentContext) -> AgentContext:
             message_history=message_history,
             summary=output_processor_result.get("summary", None),
             final_boxed_answer=output_processor_result.get("final_boxed_answer", None),
-            failure_experience_summary=output_processor_result.get(
-                "failure_experience_summary", None
+            exceed_max_turn_summary=output_processor_result.get(
+                "exceed_max_turn_summary", None
             ),
         )
diff --git a/src/agents/iterative_agent_with_rollback.py b/src/agents/iterative_agent_with_rollback.py
@@ -246,7 +246,7 @@ async def run_internal(self, ctx: AgentContext) -> AgentContext:
             message_history=message_history,
             summary=output_processor_result.get("summary", None),
             final_boxed_answer=output_processor_result.get("final_boxed_answer", None),
-            failure_experience_summary=output_processor_result.get(
-                "failure_experience_summary", None
+            exceed_max_turn_summary=output_processor_result.get(
+                "exceed_max_turn_summary", None
             ),
         )
diff --git a/src/io_processor/__init__.py b/src/io_processor/__init__.py
@@ -5,11 +5,11 @@
 """IO processor module for input/output handling."""
 
 from src.io_processor.base import BaseIOProcessor
-from src.io_processor.failure_experience_generator import (
-    FailureExperienceSummaryGenerator,
+from src.io_processor.exceed_max_turn_summary_generator import (
+    ExceedMaxTurnSummaryGenerator,
 )
 
 __all__ = [
     "BaseIOProcessor",
-    "FailureExperienceSummaryGenerator",
+    "ExceedMaxTurnSummaryGenerator",
 ]
diff --git a/src/io_processor/exceed_max_turn_summary_generator.py b/src/io_processor/exceed_max_turn_summary_generator.py
@@ -3,23 +3,23 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """
-Failure Experience Summary Generator - generates failure summaries for retry logic.
+Exceed Max Turn Summary Generator - generates summaries when task exceeds max turns without valid box.
 """
 
 from src.agents.context import AgentContext
 from src.io_processor.base import BaseIOProcessor
 from src.registry import ComponentType, register
 
 
-@register(ComponentType.IO_PROCESSOR, "FailureExperienceSummaryGenerator")
-class FailureExperienceSummaryGenerator(BaseIOProcessor):
-    """Generates failure experience summaries for pass@k retry logic."""
+@register(ComponentType.IO_PROCESSOR, "ExceedMaxTurnSummaryGenerator")
+class ExceedMaxTurnSummaryGenerator(BaseIOProcessor):
+    """Generates summaries for retry logic when task exceeds max turns without valid box."""
 
     USE_PROPAGATE_MODULE_CONFIGS = ("llm", "prompt")
 
     async def run_internal(self, ctx: AgentContext) -> AgentContext:
         prompt = self.prompt_manager.render_prompt(
-            "failure_summary_prompt",
+            "exceed_max_turn_summary_prompt",
             context=dict(
                 task_description=ctx.get("task_description"),
                 summary=ctx.get("summary", ""),
@@ -33,4 +33,4 @@ async def run_internal(self, ctx: AgentContext) -> AgentContext:
             + [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
         )
 
-        return AgentContext(failure_experience_summary=llm_response.response_text)
+        return AgentContext(exceed_max_turn_summary=llm_response.response_text)
diff --git a/src/logging/decorators.py b/src/logging/decorators.py
@@ -90,7 +90,8 @@ async def wrapper(*args: Any, **kwargs: Any):
 
             sp = Span(
                 task_id=task_context_var.task_id,
-                run_id=task_context_var.run_id,
+                attempt_id=task_context_var.attempt_id,
+                retry_id=task_context_var.retry_id,
                 span_id=span_id,
                 parent_span_id=parent_span_id,
                 name=span_name,
diff --git a/src/logging/span.py b/src/logging/span.py
@@ -21,7 +21,8 @@ class Span:
     name: str
     parent_span_id: Optional[str]
     task_id: Optional[str] = None
-    run_id: Optional[str] = None
+    attempt_id: Optional[int] = None
+    retry_id: Optional[int] = None
 
     start_ts: float = field(default_factory=time.time)
     end_ts: Optional[float] = None
diff --git a/src/logging/task_tracer.py b/src/logging/task_tracer.py
@@ -48,14 +48,15 @@ def _ensure_jsonable(x: Any) -> Any:
 @dataclass(frozen=True)
 class TaskContextVar:
     task_id: str
-    run_id: str
+    attempt_id: int
+    retry_id: int
 
     def __repr__(self) -> str:
-        return f"task_{self.task_id}_attempt_{self.run_id}"
+        return f"task_{self.task_id}_attempt_{self.attempt_id}_retry_{self.retry_id}"
 
 
 # 使用默认对象代替 None，避免后续大量的 None check
-ROOT_CONTEXT = TaskContextVar(task_id="root", run_id="0")
+ROOT_CONTEXT = TaskContextVar(task_id="root", attempt_id=0, retry_id=0)
 
 CURRENT_TASK_CONTEXT_VAR: contextvars.ContextVar[TaskContextVar] = (
     contextvars.ContextVar("CURRENT_TASK_CONTEXT_VAR", default=ROOT_CONTEXT)
@@ -81,7 +82,8 @@ def get_current_task_context_var() -> TaskContextVar:
 
 class TaskMeta(BaseModel):
     task_id: str = Field(default_factory=lambda: f"task_{uuid.uuid4().hex[:12]}")
-    run_id: str = Field(default_factory=lambda: f"run_{uuid.uuid4().hex[:12]}")
+    attempt_id: int = 1
+    retry_id: int = 0
     task_description: str = ""
     task_file_name: Optional[str] = None
 
@@ -97,10 +99,9 @@ class TaskMeta(BaseModel):
     ground_truth: Optional[str] = None
 
     is_valid_box: Optional[bool] = None
-    failure_experience_summary: Optional[str] = None
-    retry_with_experience: bool = False
-    previous_attempt_ids: List[int] = Field(default_factory=list)
-    stop_condition: Optional[str] = None
+    exceed_max_turn_summary: Optional[str] = None
+    used_exceed_max_turn_summaries: bool = False
+    previous_retry_ids: List[int] = Field(default_factory=list)
 
     updated_at: str = Field(default_factory=utc_iso)
 
diff --git a/src/utils/eval_utils.py b/src/utils/eval_utils.py
@@ -34,11 +34,6 @@
 STATUS_COMPLETED = "completed"
 STATUS_RESULT_JUDGED = "result_judged"
 
-# Stop condition constants
-STOP_CONDITION_CORRECT = "correct"
-STOP_CONDITION_VALID_BOX = "valid_box"
-STOP_CONDITION_MAX_TURN = "max_turn"
-
 # Invalid answer markers
 INVALID_ANSWER_MARKERS = [
     "NO_ANSWER",
@@ -79,12 +74,13 @@ class Task:
 
 
 class AttemptResult:
-    """Single attempt result for a benchmark task."""
+    """Single attempt result for a benchmark task (one retry within an attempt)."""
 
     def __init__(
         self,
         task: Task,
         attempt_id: int,
+        retry_id: int = 0,
         model_response: str = "",
         model_boxed_answer: str = "",
         status: str = STATUS_PENDING,
@@ -93,12 +89,12 @@ def __init__(
         is_correct: bool = False,
         error_message: Optional[str] = None,
         is_valid_box: bool = False,
-        failure_experience_summary: Optional[str] = None,
-        used_failure_experiences: Optional[List[str]] = None,
-        stop_reason: Optional[str] = None,
+        exceed_max_turn_summary: Optional[str] = None,
+        used_exceed_max_turn_summaries: Optional[List[str]] = None,
     ):
         self.task = task
         self.attempt_id = attempt_id
+        self.retry_id = retry_id
         self.model_response = model_response
         self.model_boxed_answer = model_boxed_answer
         self.status = status
@@ -107,15 +103,15 @@ def __init__(
         self.is_correct = is_correct
         self.error_message = error_message
         self.is_valid_box = is_valid_box
-        self.failure_experience_summary = failure_experience_summary
-        self.used_failure_experiences = used_failure_experiences or []
-        self.stop_reason = stop_reason
+        self.exceed_max_turn_summary = exceed_max_turn_summary
+        self.used_exceed_max_turn_summaries = used_exceed_max_turn_summaries or []
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert to dictionary for serialization."""
         return {
             "task_id": self.task.task_id,
             "attempt_id": self.attempt_id,
+            "retry_id": self.retry_id,
             "model_response": self.model_response,
             "model_boxed_answer": self.model_boxed_answer,
             "status": self.status,
@@ -124,17 +120,16 @@ def to_dict(self) -> Dict[str, Any]:
             "is_correct": self.is_correct,
             "error_message": self.error_message,
             "is_valid_box": self.is_valid_box,
-            "failure_experience_summary": self.failure_experience_summary,
-            "used_failure_experiences": self.used_failure_experiences,
-            "stop_reason": self.stop_reason,
+            "exceed_max_turn_summary": self.exceed_max_turn_summary,
+            "used_exceed_max_turn_summaries": self.used_exceed_max_turn_summaries,
         }
 
     def update_from_response(self, response: Dict[str, Any], log_path: Path):
         """Update with response data from agent.run()."""
         self.model_response = response
         self.model_boxed_answer = response.get("final_boxed_answer", "")
         self.is_valid_box = is_valid_box(self.model_boxed_answer)
-        self.failure_experience_summary = response.get("failure_experience_summary")
+        self.exceed_max_turn_summary = response.get("exceed_max_turn_summary")
         self.status = STATUS_COMPLETED if self.model_boxed_answer else STATUS_FAILED
         self.log_path = log_path
 
@@ -182,8 +177,8 @@ def __init__(self, task: Task):
         self.log_path = None
         self.attempts = []
         self.pass_at_k_success = False
-        self.stop_condition: Optional[str] = None
-        self.total_failure_experiences: int = 0
+        self.total_attempts: int = 0
+        self.total_retries: int = 0
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert to serializable dictionary."""
diff --git a/src/utils/task_utils.py b/src/utils/task_utils.py
diff --git a/test_benchmark.py b/test_benchmark.py
diff --git a/utils/check_gaia_validation_text_progress.py b/utils/check_gaia_validation_text_progress.py
diff --git a/utils/fangda_check_progress_gaia_validation_text_103.py b/utils/fangda_check_progress_gaia_validation_text_103.py

Original file line number	Diff line number	Diff line change
`@@ -5,11 +5,11 @@`
`5`	`5`	`"""IO processor module for input/output handling."""`
`6`	`6`
`7`	`7`	`from src.io_processor.base import BaseIOProcessor`
`8`		`-from src.io_processor.failure_experience_generator import (`
`9`		`-    FailureExperienceSummaryGenerator,`
	`8`	`+from src.io_processor.exceed_max_turn_summary_generator import (`
	`9`	`+    ExceedMaxTurnSummaryGenerator,`
`10`	`10`	`)`
`11`	`11`
`12`	`12`	`__all__ = [`
`13`	`13`	`    "BaseIOProcessor",`
`14`		`-    "FailureExperienceSummaryGenerator",`
	`14`	`+    "ExceedMaxTurnSummaryGenerator",`
`15`	`15`	`]`