
Commit 90f9b5b

_llm_evaluate returns artifacts if LLM returned string responses
1 parent: c779ac9

File tree: 1 file changed, +33 −11 lines


openevolve/evaluator.py

Lines changed: 33 additions & 11 deletions
```diff
@@ -125,16 +125,32 @@ async def evaluate_program(
                 eval_result = self._process_evaluation_result(result)
 
                 # Add LLM feedback if configured
+                llm_eval_result = None
                 if self.config.use_llm_feedback and self.llm_ensemble:
-                    feedback_metrics = await self._llm_evaluate(program_code)
+                    llm_result = await self._llm_evaluate(program_code)
+                    llm_eval_result = self._process_evaluation_result(llm_result)
 
                     # Combine metrics
-                    for name, value in feedback_metrics.items():
+                    for name, value in llm_result.metrics.items():
                         eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight
 
                 # Store artifacts if enabled and present
-                if artifacts_enabled and eval_result.has_artifacts() and program_id:
-                    self._pending_artifacts[program_id] = eval_result.artifacts
+                if (
+                    artifacts_enabled
+                    and (
+                        eval_result.has_artifacts()
+                        or (llm_eval_result and llm_eval_result.has_artifacts())
+                    )
+                    and program_id
+                ):
+                    self._pending_artifacts[program_id] = {}
+
+                    # Merge eval_result artifacts with llm artifacts if they exist
+                    if eval_result.has_artifacts():
+                        self._pending_artifacts[program_id].update(eval_result.artifacts)
+
+                    if llm_eval_result and llm_eval_result.has_artifacts():
+                        self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
 
                 elapsed = time.time() - start_time
                 logger.info(
```
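
The two `update` calls above fix the merge order: executor artifacts are written first, then LLM artifacts, so on a key collision the LLM value wins. A minimal standalone sketch of that ordering (the dict contents are invented for illustration, not OpenEvolve output):

```python
# Executor-produced artifacts and LLM-produced artifacts, merged in order.
eval_artifacts = {"stderr": "", "profile": "fast path taken"}
llm_artifacts = {"review": "loop bound looks off", "profile": "LLM view"}

pending = {}
pending.update(eval_artifacts)  # executor artifacts written first
pending.update(llm_artifacts)   # LLM artifacts written second: same-key entries win

# On the "profile" collision, the later update overwrites the earlier value.
assert pending["profile"] == "LLM view"
print(pending)
```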
```diff
@@ -150,6 +166,7 @@ async def evaluate_program(
                 logger.warning(
                     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
                 )
+                traceback.print_exc()
 
                 # Capture failure artifacts if enabled
                 if artifacts_enabled and program_id:
```
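
With `traceback.print_exc()` added, each failed attempt now emits the full stack trace rather than only the exception message in the warning. A standalone sketch of that retry pattern, with illustrative names (`flaky`, `max_retries`) standing in for the evaluator's internals:

```python
import logging
import traceback

logger = logging.getLogger(__name__)
max_retries = 2  # illustrative; the real value comes from self.config

def flaky() -> float:
    raise RuntimeError("boom")

for attempt in range(max_retries + 1):
    try:
        result = flaky()
        break
    except Exception as e:
        logger.warning(f"Evaluation attempt {attempt + 1}/{max_retries + 1} failed: {e}")
        traceback.print_exc()  # full stack trace to stderr, not just the message
```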
````diff
@@ -396,6 +413,7 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
             json_pattern = r"```json\n(.*?)\n```"
             import re
 
+            artifacts = {}
             avg_metrics = {}
             for i, response in enumerate(responses):
                 json_match = re.search(json_pattern, response, re.DOTALL)
````
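
For context, `json_pattern` pulls the first fenced `json` block out of each model response before parsing. A standalone sketch with an invented response string:

````python
import json
import re

json_pattern = r"```json\n(.*?)\n```"
response = "Here is my assessment:\n```json\n{\"readability\": 0.8, \"comment\": \"clear\"}\n```"

# DOTALL lets the lazy group span newlines inside the fenced block.
json_match = re.search(json_pattern, response, re.DOTALL)
if json_match:
    result = json.loads(json_match.group(1))
    print(result)  # {'readability': 0.8, 'comment': 'clear'}
````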
```diff
@@ -414,12 +432,13 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                 # Parse JSON
                 result = json.loads(json_str)
 
-                # Filter all non-numeric values
-                metrics = {
-                    name: float(value)
-                    for name, value in result.items()
-                    if isinstance(value, (int, float))
-                }
+                # All non-numeric values are artifacts, all numeric values are metrics
+                metrics = {}
+                for key, value in result.items():
+                    if not isinstance(value, (int, float)):
+                        artifacts[key] = value
+                    else:
+                        metrics[key] = float(value)
 
                 # Weight of the model in the ensemble
                 weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
```
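
The replaced dict comprehension silently dropped non-numeric values; the new loop routes them into `artifacts` instead, so textual feedback from the LLM survives. A sketch of the split on an invented parsed response:

```python
result = {"correctness": 0.9, "speed": 1, "suggestion": "inline the hot loop"}

metrics, artifacts = {}, {}
for key, value in result.items():
    if not isinstance(value, (int, float)):
        artifacts[key] = value       # strings and other objects become artifacts
    else:
        metrics[key] = float(value)  # numbers are coerced to float

print(metrics)    # {'correctness': 0.9, 'speed': 1.0}
print(artifacts)  # {'suggestion': 'inline the hot loop'}
```

One subtlety: `bool` is a subclass of `int` in Python, so a boolean value returned by the LLM would land in `metrics` as `0.0`/`1.0` rather than in `artifacts`.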
```diff
@@ -431,7 +450,10 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                     else:
                         avg_metrics[name] = value * weight
 
-            return avg_metrics
+            return EvaluationResult(
+                metrics=avg_metrics,
+                artifacts=artifacts,
+            )
 
         except Exception as e:
             logger.warning(f"Error parsing LLM response: {str(e)}")
```
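
The function now returns an `EvaluationResult` carrying both the weighted-average metrics and the collected artifacts (the `-> Dict[str, float]` annotation shown in the hunk headers is left stale by this commit). A standalone sketch of the weighted ensemble averaging feeding that return value; `EvaluationResult` is stubbed as a plain dataclass here, and the weights and metrics are invented:

```python
from dataclasses import dataclass, field

@dataclass
class EvaluationResult:
    metrics: dict = field(default_factory=dict)
    artifacts: dict = field(default_factory=dict)

per_model_metrics = [{"quality": 0.8}, {"quality": 0.6}]
weights = [0.7, 0.3]  # illustrative ensemble weights

# Accumulate each model's metrics, scaled by its ensemble weight.
avg_metrics = {}
for metrics, weight in zip(per_model_metrics, weights):
    for name, value in metrics.items():
        if name in avg_metrics:
            avg_metrics[name] += value * weight
        else:
            avg_metrics[name] = value * weight

result = EvaluationResult(metrics=avg_metrics, artifacts={"note": "from LLM"})
print(result.metrics)  # {'quality': 0.74} (up to float rounding)
```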
