Commit 791df94

Add GenSelect plugin for solution selection
Introduces the GenSelect plugin, which implements generative solution selection based on the AIMO-2 winning-solution paper. Updates the README with plugin documentation and a paper reference, bumps the version to 0.1.23, and adds a GenSelect math test case.
1 parent 26bb5c0 commit 791df94
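
For orientation (not part of the commit itself): optillm exposes an OpenAI-compatible proxy and dispatches to an approach or plugin based on the prefix of the requested model name, so a request for `genselect-<base-model>` should route through this plugin. The sketch below illustrates that flow; the endpoint, API key, and base model name are assumptions, not taken from this commit.

# Hedged usage sketch: calling the GenSelect plugin through a locally
# running optillm proxy. Endpoint, key, and base model are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")

response = client.chat.completions.create(
    model="genselect-gpt-4o-mini",  # plugin slug prefixed to the base model
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the sum of the first 100 positive integers?"},
    ],
)
print(response.choices[0].message.content)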

File tree

4 files changed: +277 −1 lines changed

README.md

Lines changed: 2 additions & 0 deletions

@@ -377,6 +377,7 @@ Check this log file for connection issues, tool execution errors, and other diag
 | Read URLs | `readurls` | Reads all URLs found in the request, fetches the content at the URL and adds it to the context |
 | Execute Code | `executecode` | Enables use of code interpreter to execute Python code in requests and LLM-generated responses |
 | JSON | `json` | Enables structured outputs using the outlines library, supports pydantic types and JSON schema |
+| GenSelect | `genselect` | Generative Solution Selection - generates multiple candidates and selects the best based on quality criteria |

 ## Available parameters

@@ -587,6 +588,7 @@ called patchflows. We saw huge performance gains across all the supported patchf
 - [Unsupervised Evaluation of Code LLMs with Round-Trip Correctness](https://arxiv.org/abs/2402.08699) - [Inspired the implementation of rto](optillm/rto.py)
 - [Patched MOA: optimizing inference for diverse software development tasks](https://arxiv.org/abs/2407.18521) - [Implementation](optillm/moa.py)
 - [Patched RTC: evaluating LLMs for diverse software development tasks](https://arxiv.org/abs/2407.16557) - [Implementation](optillm/rto.py)
+- [AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with OpenMathReasoning dataset](https://arxiv.org/abs/2504.16891) - [Implementation](optillm/plugins/genselect_plugin.py)

 ## Citation

optillm/plugins/genselect_plugin.py (new file)

Lines changed: 269 additions & 0 deletions

@@ -0,0 +1,269 @@

"""
GenSelect Plugin for OptILLM

This plugin implements the Generative Solution Selection (GenSelect) approach from
the paper "AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning
Models with OpenMathReasoning dataset" (arXiv:2504.16891).

GenSelect generates multiple candidate solutions and uses an LLM to compare and
select the best one based on quality criteria. Unlike majority voting, which counts
answer frequencies, GenSelect evaluates the quality of the entire response.
"""

import logging
import re
from typing import Tuple, Dict, Any, List, Optional

logger = logging.getLogger(__name__)

# Plugin identifier
SLUG = "genselect"

# Default configuration
DEFAULT_NUM_CANDIDATES = 4
DEFAULT_TEMPERATURE = 0.7
DEFAULT_COMPARISON_TEMPERATURE = 0.3
DEFAULT_COMPARISON_MODE = "batch"  # "batch" or "tournament"
DEFAULT_INCLUDE_REASONING = False


def create_comparison_prompt(candidates: List[str], query: str, comparison_mode: str = "batch") -> str:
    """
    Create a prompt for comparing candidate solutions.

    Args:
        candidates: List of candidate responses
        query: The original user query
        comparison_mode: "batch" for all at once, "tournament" for pairwise

    Returns:
        The comparison prompt
    """
    if comparison_mode != "batch":
        # Tournament mode (pairwise comparisons) is a future enhancement;
        # fall back to batch mode for now.
        return create_comparison_prompt(candidates, query, "batch")

    prompt = f"""You are an expert evaluator tasked with selecting the best response to the following query:

Query: {query}

I will provide you with {len(candidates)} different candidate responses. Please analyze each one carefully and select the best response based on the following criteria:

1. **Correctness and Accuracy**: Is the response factually correct and accurate?
2. **Completeness**: Does it fully address all aspects of the query?
3. **Clarity**: Is the explanation clear and easy to understand?
4. **Logical Coherence**: Is the reasoning sound and well-structured?
5. **Practical Value**: Does it provide useful, actionable information?

For coding problems, also consider:
- Code correctness and efficiency
- Best practices and style
- Error handling

Here are the candidate responses:

"""
    for i, candidate in enumerate(candidates, 1):
        prompt += f"=== Candidate {i} ===\n{candidate}\n\n"

    prompt += """Please analyze all candidates and provide:
1. A brief comparison highlighting the strengths and weaknesses of each candidate
2. Your selection of the best candidate (specify the number)
3. A clear explanation of why you selected that candidate

Format your response as:
COMPARISON:
[Your comparison analysis]

BEST CANDIDATE: [number]

REASONING:
[Your explanation for the selection]"""

    return prompt


def parse_selection_response(response: str, num_candidates: int) -> Tuple[int, str]:
    """
    Parse the selection response to extract the chosen candidate number and reasoning.

    Args:
        response: The LLM's comparison response
        num_candidates: Total number of candidates

    Returns:
        Tuple of (selected_index, reasoning)
    """
    # Look for the "BEST CANDIDATE: X" pattern
    match = re.search(r'BEST CANDIDATE:\s*(\d+)', response, re.IGNORECASE)
    if match:
        candidate_num = int(match.group(1))
        # Convert to 0-based index
        selected_index = candidate_num - 1

        # Validate the selection
        if 0 <= selected_index < num_candidates:
            # Extract reasoning if available
            reasoning_match = re.search(r'REASONING:\s*(.+)', response, re.IGNORECASE | re.DOTALL)
            reasoning = reasoning_match.group(1).strip() if reasoning_match else "No explicit reasoning provided"

            logger.info(f"Selected candidate {candidate_num} based on comparison")
            return selected_index, reasoning

    # Fallback: look for other patterns like "Candidate X is the best"
    patterns = [
        r'[Cc]andidate\s+(\d+)\s+is\s+(?:the\s+)?best',
        r'[Ii]\s+(?:would\s+)?select\s+[Cc]andidate\s+(\d+)',
        r'[Tt]he\s+best\s+(?:response|candidate)\s+is\s+(?:number\s+)?(\d+)',
    ]

    for pattern in patterns:
        match = re.search(pattern, response)
        if match:
            candidate_num = int(match.group(1))
            selected_index = candidate_num - 1
            if 0 <= selected_index < num_candidates:
                logger.info(f"Selected candidate {candidate_num} using fallback pattern")
                return selected_index, "Selection extracted from response pattern"

    # If no clear selection is found, log a warning and return the first candidate
    logger.warning("Could not parse selection from comparison response, defaulting to first candidate")
    return 0, "Failed to parse selection, defaulted to first candidate"


def run(
    system_prompt: str,
    initial_query: str,
    client,
    model: str,
    request_config: Optional[Dict[str, Any]] = None
) -> Tuple[str, int]:
    """
    Main entry point for the GenSelect plugin.

    Generates multiple candidate solutions and uses LLM comparison to select the best one.

    Args:
        system_prompt: System prompt for the model
        initial_query: User's query
        client: OpenAI-compatible client instance
        model: Model identifier
        request_config: Additional configuration parameters

    Returns:
        Tuple of (response_text, completion_tokens_used)
    """
    logger.info("Starting GenSelect process")

    # Extract configuration
    config = request_config or {}
    num_candidates = config.get('num_candidates', DEFAULT_NUM_CANDIDATES)
    temperature = config.get('temperature', DEFAULT_TEMPERATURE)
    comparison_temperature = config.get('comparison_temperature', DEFAULT_COMPARISON_TEMPERATURE)
    comparison_mode = config.get('comparison_mode', DEFAULT_COMPARISON_MODE)
    include_reasoning = config.get('include_reasoning', DEFAULT_INCLUDE_REASONING)
    max_tokens = config.get('max_tokens', 4096)

    # Clamp num_candidates to a reasonable range (2-16, following the paper)
    num_candidates = max(2, min(16, num_candidates))

    logger.info(f"Generating {num_candidates} candidates with temperature={temperature}")

    # Prepare messages for candidate generation
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": initial_query}
    ]

    candidates = []
    total_tokens = 0

    try:
        # Try to generate all candidates in one request using the n parameter
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            n=num_candidates,
            temperature=temperature,
            max_tokens=max_tokens
        )

        candidates = [choice.message.content for choice in response.choices]
        total_tokens += response.usage.completion_tokens

        logger.info(f"Generated {len(candidates)} candidates using n parameter. Tokens: {total_tokens}")

    except Exception as e:
        logger.warning(f"n parameter not supported: {str(e)}")
        logger.info("Falling back to sequential generation")

        # Fallback: generate candidates one by one
        for i in range(num_candidates):
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    temperature=temperature,
                    max_tokens=max_tokens
                )
                candidates.append(response.choices[0].message.content)
                total_tokens += response.usage.completion_tokens
                logger.debug(f"Generated candidate {i+1}/{num_candidates}")

            except Exception as gen_error:
                logger.error(f"Error generating candidate {i+1}: {str(gen_error)}")
                continue

    if len(candidates) < 2:
        logger.error(f"Insufficient candidates generated ({len(candidates)})")
        if candidates:
            return candidates[0], total_tokens
        return "Error: Could not generate sufficient candidates for selection", total_tokens

    # Create the comparison prompt
    comparison_prompt = create_comparison_prompt(candidates, initial_query, comparison_mode)

    # Ask the LLM to compare the candidates and select the best one
    logger.info("Comparing candidates for selection")

    try:
        comparison_messages = [
            {"role": "system", "content": "You are an expert evaluator skilled at comparing and selecting high-quality responses."},
            {"role": "user", "content": comparison_prompt}
        ]

        comparison_response = client.chat.completions.create(
            model=model,
            messages=comparison_messages,
            temperature=comparison_temperature,
            max_tokens=2048  # Comparison doesn't need as many tokens
        )

        selection_response = comparison_response.choices[0].message.content
        total_tokens += comparison_response.usage.completion_tokens

        # Parse the selection
        selected_index, reasoning = parse_selection_response(selection_response, len(candidates))

        # Get the selected candidate
        selected_candidate = candidates[selected_index]

        logger.info("GenSelect Summary:")
        logger.info(f" - Generated {len(candidates)} candidates")
        logger.info(f" - Selected candidate {selected_index + 1}")
        logger.info(f" - Total tokens used: {total_tokens}")

        # Optionally include the selection reasoning in the response
        if include_reasoning:
            final_response = f"{selected_candidate}\n\n---\n**GenSelect Reasoning**: {reasoning}"
        else:
            final_response = selected_candidate

        return final_response, total_tokens

    except Exception as e:
        logger.error(f"Error during comparison: {str(e)}")
        # Fall back to the first candidate
        logger.warning("Falling back to first candidate due to comparison error")
        return candidates[0], total_tokens
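
As a quick illustration of the parsing contract above, this sketch feeds a well-formed comparison response into parse_selection_response; the import path assumes the module location referenced in the README (optillm/plugins/genselect_plugin.py).

# Sanity-check sketch for the selection parser; the import path is an assumption.
from optillm.plugins.genselect_plugin import parse_selection_response

sample = """COMPARISON:
Candidate 2 is the most complete and accurate of the three.

BEST CANDIDATE: 2

REASONING:
It answers every part of the query and shows its work."""

index, reasoning = parse_selection_response(sample, num_candidates=3)
assert index == 1  # candidate numbers are 1-based in the prompt, 0-based here
print(reasoning)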

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "optillm"
-version = "0.1.22"
+version = "0.1.23"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"

test_cases.json

Lines changed: 5 additions & 0 deletions

@@ -33,5 +33,10 @@
         "name" : "GH",
         "system_prompt" : "",
         "query" : "Find the largest possible real part of [(75+117i)z + \\frac{96+144i}{z}] where z is a complex number with |z|=4"
+    },
+    {
+        "name": "GenSelect Math",
+        "system_prompt": "You are a helpful AI assistant with expertise in mathematical reasoning.",
+        "query": "A farmer has 17 sheep. All but 9 die. How many sheep does the farmer have left? Explain your reasoning step by step."
     }
 ]
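
To see how this test case exercises the plugin end to end, here is a hedged sketch that loads the entry and calls the plugin's run() directly; the client construction and model name are placeholders, and any OpenAI-compatible client should work.

# Hedged sketch: driving run() with the new "GenSelect Math" test case.
import json
from openai import OpenAI
from optillm.plugins.genselect_plugin import run  # assumed module path

with open("test_cases.json") as f:
    case = next(c for c in json.load(f) if c["name"] == "GenSelect Math")

client = OpenAI()  # placeholder; any OpenAI-compatible client works
answer, completion_tokens = run(
    case["system_prompt"],
    case["query"],
    client,
    "gpt-4o-mini",
    request_config={"num_candidates": 3, "include_reasoning": True},
)
print(answer)
print(f"Completion tokens used: {completion_tokens}")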
