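wip: MARS proof-aware answer extraction, mars_config request overrides, extraction fallback, and IMO eval without thinking tags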

codelion · codelion · commit 2e35fbbcfd21 · 2025-10-01T22:16:56.000+08:00
diff --git a/optillm/mars/answer_extraction.py b/optillm/mars/answer_extraction.py
@@ -128,7 +128,12 @@ def extract_generic_answer(text: str) -> str:
     """
     Extract answer for generic (non-code, non-math) problems
     Returns the last paragraph or sentence as the final answer
+    For proof-based problems, returns the last three paragraphs (or the full text if there are fewer than three) when no conclusion marker yields a substantial answer
     """
+    # Check if this looks like a proof problem (geometry, proofs, etc.)
+    proof_indicators = ['proof', 'QED', 'proven', 'demonstrate', 'show that', 'prove that']
+    is_proof = any(indicator.lower() in text.lower() for indicator in proof_indicators)
+
     # Try to find conclusion markers
     conclusion_markers = [
         'In conclusion',
@@ -148,11 +153,25 @@ def extract_generic_answer(text: str) -> str:
                 answer = parts[1].strip()
                 # Get first sentence/paragraph after marker
                 first_para = answer.split('\n\n')[0].strip()
-                logger.info(f"📝 EXTRACTION: Found answer after '{marker}' ({len(first_para)} chars)")
-                return first_para
+                if len(first_para) > 20:  # Ensure it's substantial
+                    logger.info(f"📝 EXTRACTION: Found answer after '{marker}' ({len(first_para)} chars)")
+                    return first_para
 
-    # Fallback: Return last paragraph
+    # For proof problems, return more context (last 3 paragraphs, or the full text if there are fewer than 3)
     paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
+
+    if is_proof and paragraphs:
+        # For proofs, include the concluding context (last 3 paragraphs)
+        if len(paragraphs) >= 3:
+            conclusion_text = '\n\n'.join(paragraphs[-3:])
+            logger.info(f"📝 EXTRACTION: Proof detected, using last 3 paragraphs ({len(conclusion_text)} chars)")
+            return conclusion_text
+        else:
+            # Short proof, return full text
+            logger.info(f"📝 EXTRACTION: Short proof detected, returning full text ({len(text)} chars)")
+            return text
+
+    # For non-proof problems, return last paragraph
     if paragraphs:
         final_para = paragraphs[-1]
         logger.info(f"📝 EXTRACTION: Using last paragraph ({len(final_para)} chars)")
diff --git a/optillm/mars/mars.py b/optillm/mars/mars.py
@@ -119,7 +119,13 @@ async def _run_mars_parallel(
     if use_lightweight:
         logger.info(f"⚡ CONFIG: Using LIGHTWEIGHT MARS config for coding (fast mode)")
 
-    # Override max_tokens from request_config if provided
+    # Override with mars_config if provided
+    if request_config and 'mars_config' in request_config:
+        mars_config = request_config['mars_config']
+        config.update(mars_config)
+        logger.info(f"⚙️  CONFIG: Applied mars_config overrides: {list(mars_config.keys())}")
+
+    # Override max_tokens from request_config if provided (backward compatibility)
     if request_config and 'max_tokens' in request_config:
         config['max_tokens'] = request_config['max_tokens']
         logger.info(f"⚙️  CONFIG: Using max_tokens from request: {config['max_tokens']}")
@@ -278,21 +284,27 @@ async def _run_mars_parallel(
 
         # Apply thinking tags if enabled
         if config.get('use_thinking_tags', True):
-            logger.info(f"📝 ANSWER EXTRACTION: Extracting clean answer with mode '{config.get('answer_extraction_mode', 'auto')}'")
+            try:
+                logger.info(f"📝 ANSWER EXTRACTION: Extracting clean answer with mode '{config.get('answer_extraction_mode', 'auto')}'")
 
-            # Extract clean answer from synthesis output
-            clean_answer = extract_clean_answer(
-                final_solution,
-                mode=config.get('answer_extraction_mode', 'auto')
-            )
+                # Extract clean answer from synthesis output
+                clean_answer = extract_clean_answer(
+                    final_solution,
+                    mode=config.get('answer_extraction_mode', 'auto')
+                )
 
-            logger.info(f"📝 ANSWER EXTRACTION: Extracted {len(clean_answer)} char answer from {len(final_solution)} char synthesis")
+                logger.info(f"📝 ANSWER EXTRACTION: Extracted {len(clean_answer)} char answer from {len(final_solution)} char synthesis")
 
-            # Wrap reasoning in thinking tags
-            formatted_output = wrap_with_thinking_tags(final_solution, clean_answer)
+                # Wrap reasoning in thinking tags
+                formatted_output = wrap_with_thinking_tags(final_solution, clean_answer)
 
-            logger.info(f"📝 ANSWER EXTRACTION: Final output: {len(formatted_output)} chars (with thinking tags)")
-            return formatted_output, total_reasoning_tokens
+                logger.info(f"📝 ANSWER EXTRACTION: Final output: {len(formatted_output)} chars (with thinking tags)")
+                return formatted_output, total_reasoning_tokens
+            except Exception as extract_error:
+                # If answer extraction fails, fall back to raw synthesis
+                logger.warning(f"⚠️  ANSWER EXTRACTION FAILED: {str(extract_error)}")
+                logger.warning(f"⚠️  Falling back to raw synthesis output ({len(final_solution)} chars)")
+                return final_solution, total_reasoning_tokens
         else:
             logger.info(f"📝 ANSWER EXTRACTION: Thinking tags disabled, returning raw synthesis")
             return final_solution, total_reasoning_tokens
diff --git a/optillm/mars/prompts.py b/optillm/mars/prompts.py
@@ -78,6 +78,14 @@
 
 Important: Preserve the depth and detail needed for complex problems. Do not over-condense - maintain all critical reasoning steps and justifications. If agents have extracted specific numerical answers, ensure these are preserved and clearly formatted in your final response.
 
+**CRITICAL FOR PROOF-BASED PROBLEMS (geometry, number theory, etc.):**
+- The final solution MUST be COMPLETE and SELF-CONTAINED
+- Include ALL lemmas, theorems, and intermediate results WITH FULL JUSTIFICATIONS
+- Do NOT reference earlier work or assume prior knowledge
+- Every step must be explicitly proven or justified
+- Do NOT abbreviate proofs or say "as shown above" - repeat all necessary reasoning
+- The evaluator will ONLY see your final solution, so it must stand alone
+
 Create the most robust and well-reasoned solution possible, drawing from the collective intelligence of all agents."""
 
 IMPROVEMENT_PROMPT = """You are tasked with improving a solution based on verification feedback.
diff --git a/scripts/eval_imo25_benchmark.py b/scripts/eval_imo25_benchmark.py
@@ -712,7 +712,19 @@ def main():
     print(f"Results will be saved to: {results_file}")
 
     # Prepare extra_body for approach
-    extra_body = {"optillm_approach": args.approach} if args.approach != "none" else None
+    # Special handling for MARS on IMO problems: disable thinking tags for proofs
+    if args.approach == "mars":
+        extra_body = {
+            "optillm_approach": "mars",
+            "mars_config": {
+                "use_thinking_tags": False,  # IMO proofs need full visibility to evaluator
+                "answer_extraction_mode": "none"  # Don't extract - proofs ARE the answer
+            }
+        }
+    elif args.approach != "none":
+        extra_body = {"optillm_approach": args.approach}
+    else:
+        extra_body = None
 
     # Evaluate each problem
     for problem_data in tqdm(problems_to_evaluate, desc="Solving IMO problems"):