@@ -125,16 +125,32 @@ async def evaluate_program(
                 eval_result = self._process_evaluation_result(result)

                 # Add LLM feedback if configured
+                llm_eval_result = None
                 if self.config.use_llm_feedback and self.llm_ensemble:
-                    feedback_metrics = await self._llm_evaluate(program_code)
+                    llm_result = await self._llm_evaluate(program_code)
+                    llm_eval_result = self._process_evaluation_result(llm_result)

                     # Combine metrics
-                    for name, value in feedback_metrics.items():
+                    for name, value in llm_result.metrics.items():
                         eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight

                 # Store artifacts if enabled and present
-                if artifacts_enabled and eval_result.has_artifacts() and program_id:
-                    self._pending_artifacts[program_id] = eval_result.artifacts
+                if (
+                    artifacts_enabled
+                    and (
+                        eval_result.has_artifacts()
+                        or (llm_eval_result and llm_eval_result.has_artifacts())
+                    )
+                    and program_id
+                ):
+                    self._pending_artifacts[program_id] = {}
+
+                    # Merge eval_result artifacts with llm artifacts if they exist
+                    if eval_result.has_artifacts():
+                        self._pending_artifacts[program_id].update(eval_result.artifacts)
+
+                    if llm_eval_result and llm_eval_result.has_artifacts():
+                        self._pending_artifacts[program_id].update(llm_eval_result.artifacts)

                 elapsed = time.time() - start_time
                 logger.info(
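The artifact handling in this hunk is easiest to follow in isolation. Below is a minimal, runnable sketch of the same merge pattern; the EvaluationResult dataclass and the sample metrics, artifacts, and program id are simplified stand-ins invented for illustration, not the project's actual types or values.

from dataclasses import dataclass, field
from typing import Any, Dict

@dataclass
class EvaluationResult:
    # Simplified stand-in for the real result type: numeric scores plus free-form artifacts.
    metrics: Dict[str, float] = field(default_factory=dict)
    artifacts: Dict[str, Any] = field(default_factory=dict)

    def has_artifacts(self) -> bool:
        return bool(self.artifacts)

pending_artifacts: Dict[str, Dict[str, Any]] = {}
program_id = "prog-123"

eval_result = EvaluationResult(metrics={"score": 0.9}, artifacts={"stderr": ""})
llm_eval_result = EvaluationResult(artifacts={"llm_reasoning": "clear structure"})

# Same merge as the diff: start from an empty dict, then layer in each source's artifacts.
if eval_result.has_artifacts() or (llm_eval_result and llm_eval_result.has_artifacts()):
    pending_artifacts[program_id] = {}
    if eval_result.has_artifacts():
        pending_artifacts[program_id].update(eval_result.artifacts)
    if llm_eval_result and llm_eval_result.has_artifacts():
        pending_artifacts[program_id].update(llm_eval_result.artifacts)

Because the LLM artifacts are applied last, they overwrite any execution artifact that happens to use the same key; that ordering matches the diff above.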
@@ -150,6 +166,7 @@ async def evaluate_program(
                 logger.warning(
                     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
                 )
+                traceback.print_exc()

                 # Capture failure artifacts if enabled
                 if artifacts_enabled and program_id:
@@ -396,6 +413,7 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
             json_pattern = r"```json\n(.*?)\n```"
             import re

+            artifacts = {}
             avg_metrics = {}
             for i, response in enumerate(responses):
                 json_match = re.search(json_pattern, response, re.DOTALL)
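For reference, the json_pattern used here pulls a fenced JSON block out of a free-form model reply before parsing. A small standalone sketch of that extraction step, with a made-up response string:

import json
import re

# Made-up model reply: prose wrapped around a fenced JSON block.
response = 'Assessment below.\n```json\n{"readability": 0.8, "comment": "clear"}\n```\nDone.'

json_pattern = r"```json\n(.*?)\n```"
json_match = re.search(json_pattern, response, re.DOTALL)  # DOTALL lets (.*?) span newlines
if json_match:
    result = json.loads(json_match.group(1))
    print(result)  # {'readability': 0.8, 'comment': 'clear'}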
@@ -414,12 +432,13 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                 # Parse JSON
                 result = json.loads(json_str)

-                # Filter all non-numeric values
-                metrics = {
-                    name: float(value)
-                    for name, value in result.items()
-                    if isinstance(value, (int, float))
-                }
+                # All non-numeric values are artifacts, all numeric values are metrics
+                metrics = {}
+                for key, value in result.items():
+                    if not isinstance(value, (int, float)):
+                        artifacts[key] = value
+                    else:
+                        metrics[key] = float(value)

                 # Weight of the model in the ensemble
                 weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
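The rewritten loop splits the parsed JSON by type: numeric fields become metrics, everything else (strings, lists, nested objects) is kept as an artifact. A minimal sketch with invented field names:

# Invented example of a parsed LLM response.
result = {
    "readability": 0.8,
    "maintainability": 0.7,
    "reasoning": "Well factored, but error handling is thin.",
}

metrics = {}
artifacts = {}
for key, value in result.items():
    if not isinstance(value, (int, float)):
        artifacts[key] = value       # non-numeric -> artifact
    else:
        metrics[key] = float(value)  # numeric -> metric

# metrics   == {"readability": 0.8, "maintainability": 0.7}
# artifacts == {"reasoning": "Well factored, but error handling is thin."}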
@@ -431,7 +450,10 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                     else:
                         avg_metrics[name] = value * weight

-            return avg_metrics
+            return EvaluationResult(
+                metrics=avg_metrics,
+                artifacts=artifacts,
+            )

         except Exception as e:
             logger.warning(f"Error parsing LLM response: {str(e)}")
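Putting the hunks together: each model's metrics are folded into avg_metrics as a weighted sum, and the helper now returns an EvaluationResult carrying both the averaged metrics and any textual artifacts instead of a bare dict. A short sketch of that accumulation, assuming the ensemble weights already sum to 1.0 (an assumption for this example; the real ensemble may normalize elsewhere):

# Two made-up per-model metric dicts and their ensemble weights.
per_model_metrics = [
    {"readability": 0.8, "correctness": 0.9},
    {"readability": 0.6, "correctness": 1.0},
]
weights = [0.7, 0.3]  # assumed to sum to 1.0

avg_metrics = {}
for metrics, weight in zip(per_model_metrics, weights):
    for name, value in metrics.items():
        if name in avg_metrics:
            avg_metrics[name] += value * weight
        else:
            avg_metrics[name] = value * weight

# avg_metrics ≈ {"readability": 0.74, "correctness": 0.93} (up to float rounding)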