Skip to content

Commit 2e35fbb

Browse files
committed
wip
1 parent d9c04b5 commit 2e35fbb

File tree

4 files changed

+67
-16
lines changed

4 files changed

+67
-16
lines changed

optillm/mars/answer_extraction.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,12 @@ def extract_generic_answer(text: str) -> str:
128128
"""
129129
Extract answer for generic (non-code, non-math) problems
130130
Returns the last paragraph or sentence as the final answer
131+
For proof-based problems, returns the last three paragraphs (or the full text when there are fewer than three) when no conclusion marker produces a substantial answer
131132
"""
133+
# Check if this looks like a proof problem (geometry, proofs, etc.)
134+
proof_indicators = ['proof', 'QED', 'proven', 'demonstrate', 'show that', 'prove that']
135+
is_proof = any(indicator.lower() in text.lower() for indicator in proof_indicators)
136+
132137
# Try to find conclusion markers
133138
conclusion_markers = [
134139
'In conclusion',
@@ -148,11 +153,25 @@ def extract_generic_answer(text: str) -> str:
148153
answer = parts[1].strip()
149154
# Get the first paragraph after the marker (text up to the next blank line)
150155
first_para = answer.split('\n\n')[0].strip()
151-
logger.info(f"📝 EXTRACTION: Found answer after '{marker}' ({len(first_para)} chars)")
152-
return first_para
156+
if len(first_para) > 20: # Ensure it's substantial
157+
logger.info(f"📝 EXTRACTION: Found answer after '{marker}' ({len(first_para)} chars)")
158+
return first_para
153159

154-
# Fallback: Return last paragraph
160+
# For proof problems, return more context (last 3 paragraphs, or the full text if there are fewer than 3)
155161
paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
162+
163+
if is_proof and paragraphs:
164+
# For proofs, include the concluding paragraphs (the last 3 paragraphs)
165+
if len(paragraphs) >= 3:
166+
conclusion_text = '\n\n'.join(paragraphs[-3:])
167+
logger.info(f"📝 EXTRACTION: Proof detected, using last 3 paragraphs ({len(conclusion_text)} chars)")
168+
return conclusion_text
169+
else:
170+
# Short proof, return full text
171+
logger.info(f"📝 EXTRACTION: Short proof detected, returning full text ({len(text)} chars)")
172+
return text
173+
174+
# For non-proof problems, return last paragraph
156175
if paragraphs:
157176
final_para = paragraphs[-1]
158177
logger.info(f"📝 EXTRACTION: Using last paragraph ({len(final_para)} chars)")

optillm/mars/mars.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,13 @@ async def _run_mars_parallel(
119119
if use_lightweight:
120120
logger.info(f"⚡ CONFIG: Using LIGHTWEIGHT MARS config for coding (fast mode)")
121121

122-
# Override max_tokens from request_config if provided
122+
# Override with mars_config if provided
123+
if request_config and 'mars_config' in request_config:
124+
mars_config = request_config['mars_config']
125+
config.update(mars_config)
126+
logger.info(f"⚙️ CONFIG: Applied mars_config overrides: {list(mars_config.keys())}")
127+
128+
# Override max_tokens from request_config if provided (backward compatibility); applied after mars_config, so a top-level max_tokens takes precedence
123129
if request_config and 'max_tokens' in request_config:
124130
config['max_tokens'] = request_config['max_tokens']
125131
logger.info(f"⚙️ CONFIG: Using max_tokens from request: {config['max_tokens']}")
@@ -278,21 +284,27 @@ async def _run_mars_parallel(
278284

279285
# Apply thinking tags if enabled
280286
if config.get('use_thinking_tags', True):
281-
logger.info(f"📝 ANSWER EXTRACTION: Extracting clean answer with mode '{config.get('answer_extraction_mode', 'auto')}'")
287+
try:
288+
logger.info(f"📝 ANSWER EXTRACTION: Extracting clean answer with mode '{config.get('answer_extraction_mode', 'auto')}'")
282289

283-
# Extract clean answer from synthesis output
284-
clean_answer = extract_clean_answer(
285-
final_solution,
286-
mode=config.get('answer_extraction_mode', 'auto')
287-
)
290+
# Extract clean answer from synthesis output
291+
clean_answer = extract_clean_answer(
292+
final_solution,
293+
mode=config.get('answer_extraction_mode', 'auto')
294+
)
288295

289-
logger.info(f"📝 ANSWER EXTRACTION: Extracted {len(clean_answer)} char answer from {len(final_solution)} char synthesis")
296+
logger.info(f"📝 ANSWER EXTRACTION: Extracted {len(clean_answer)} char answer from {len(final_solution)} char synthesis")
290297

291-
# Wrap reasoning in thinking tags
292-
formatted_output = wrap_with_thinking_tags(final_solution, clean_answer)
298+
# Wrap reasoning in thinking tags
299+
formatted_output = wrap_with_thinking_tags(final_solution, clean_answer)
293300

294-
logger.info(f"📝 ANSWER EXTRACTION: Final output: {len(formatted_output)} chars (with thinking tags)")
295-
return formatted_output, total_reasoning_tokens
301+
logger.info(f"📝 ANSWER EXTRACTION: Final output: {len(formatted_output)} chars (with thinking tags)")
302+
return formatted_output, total_reasoning_tokens
303+
except Exception as extract_error:
304+
# If answer extraction fails, fall back to raw synthesis
305+
logger.warning(f"⚠️ ANSWER EXTRACTION FAILED: {str(extract_error)}")
306+
logger.warning(f"⚠️ Falling back to raw synthesis output ({len(final_solution)} chars)")
307+
return final_solution, total_reasoning_tokens
296308
else:
297309
logger.info(f"📝 ANSWER EXTRACTION: Thinking tags disabled, returning raw synthesis")
298310
return final_solution, total_reasoning_tokens

optillm/mars/prompts.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,14 @@
7878
7979
Important: Preserve the depth and detail needed for complex problems. Do not over-condense - maintain all critical reasoning steps and justifications. If agents have extracted specific numerical answers, ensure these are preserved and clearly formatted in your final response.
8080
81+
**CRITICAL FOR PROOF-BASED PROBLEMS (geometry, number theory, etc.):**
82+
- The final solution MUST be COMPLETE and SELF-CONTAINED
83+
- Include ALL lemmas, theorems, and intermediate results WITH FULL JUSTIFICATIONS
84+
- Do NOT reference earlier work or assume prior knowledge
85+
- Every step must be explicitly proven or justified
86+
- Do NOT abbreviate proofs or say "as shown above" - repeat all necessary reasoning
87+
- The evaluator will ONLY see your final solution, so it must stand alone
88+
8189
Create the most robust and well-reasoned solution possible, drawing from the collective intelligence of all agents."""
8290

8391
IMPROVEMENT_PROMPT = """You are tasked with improving a solution based on verification feedback.

scripts/eval_imo25_benchmark.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -712,7 +712,19 @@ def main():
712712
print(f"Results will be saved to: {results_file}")
713713

714714
# Prepare extra_body for approach
715-
extra_body = {"optillm_approach": args.approach} if args.approach != "none" else None
715+
# Special handling for MARS on IMO problems: disable thinking tags for proofs
716+
if args.approach == "mars":
717+
extra_body = {
718+
"optillm_approach": "mars",
719+
"mars_config": {
720+
"use_thinking_tags": False, # IMO proofs need full visibility to evaluator
721+
"answer_extraction_mode": "none" # Don't extract - proofs ARE the answer
722+
}
723+
}
724+
elif args.approach != "none":
725+
extra_body = {"optillm_approach": args.approach}
726+
else:
727+
extra_body = None
716728

717729
# Evaluate each problem
718730
for problem_data in tqdm(problems_to_evaluate, desc="Solving IMO problems"):

0 commit comments

Comments
 (0)