@@ -125,16 +125,32 @@ async def evaluate_program(
         eval_result = self._process_evaluation_result(result)
 
         # Add LLM feedback if configured
+        llm_eval_result = None
         if self.config.use_llm_feedback and self.llm_ensemble:
-            feedback_metrics = await self._llm_evaluate(program_code)
+            llm_result = await self._llm_evaluate(program_code)
+            llm_eval_result = self._process_evaluation_result(llm_result)
 
             # Combine metrics
-            for name, value in feedback_metrics.items():
+            for name, value in llm_result.metrics.items():
                 eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight
 
         # Store artifacts if enabled and present
-        if artifacts_enabled and eval_result.has_artifacts() and program_id:
-            self._pending_artifacts[program_id] = eval_result.artifacts
+        if (
+            artifacts_enabled
+            and (
+                eval_result.has_artifacts()
+                or (llm_eval_result and llm_eval_result.has_artifacts())
+            )
+            and program_id
+        ):
+            self._pending_artifacts[program_id] = {}
+
+            # Merge eval_result artifacts with llm artifacts if they exist
+            if eval_result.has_artifacts():
+                self._pending_artifacts[program_id].update(eval_result.artifacts)
+
+            if llm_eval_result and llm_eval_result.has_artifacts():
+                self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
 
         elapsed = time.time() - start_time
         logger.info(
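
The hunk above changes `evaluate_program` so that artifacts from the program evaluator and from LLM feedback are merged into a single `_pending_artifacts` entry rather than storing only the evaluator's artifacts. As a rough illustration of that merge (not the project's actual `EvaluationResult` definition, which may carry more fields), assume a minimal container with `metrics`, `artifacts`, and `has_artifacts()`:

```python
# Minimal sketch of the container assumed by the hunk above; only metrics,
# artifacts, and has_artifacts() are exercised by this change.
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class EvaluationResult:
    metrics: Dict[str, float] = field(default_factory=dict)
    artifacts: Dict[str, Any] = field(default_factory=dict)

    def has_artifacts(self) -> bool:
        return bool(self.artifacts)


# Toy data to show the merge order: LLM artifacts are applied last.
eval_result = EvaluationResult(
    metrics={"score": 0.8},
    artifacts={"stderr": "", "profile": "evaluator view"},
)
llm_eval_result = EvaluationResult(
    metrics={"readability": 0.9},
    artifacts={"profile": "llm view", "reasoning": "clear structure"},
)

pending: Dict[str, Any] = {}
if eval_result.has_artifacts():
    pending.update(eval_result.artifacts)
if llm_eval_result and llm_eval_result.has_artifacts():
    pending.update(llm_eval_result.artifacts)

assert pending["profile"] == "llm view"  # later update() wins on duplicate keys
```

Because the LLM artifacts are applied second via `dict.update`, they take precedence on key collisions with the evaluator's artifacts.
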
@@ -396,6 +412,7 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
             json_pattern = r"```json\n(.*?)\n```"
             import re
 
+            artifacts = {}
             avg_metrics = {}
             for i, response in enumerate(responses):
                 json_match = re.search(json_pattern, response, re.DOTALL)
@@ -414,12 +431,13 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                 # Parse JSON
                 result = json.loads(json_str)
 
-                # Filter all non-numeric values
-                metrics = {
-                    name: float(value)
-                    for name, value in result.items()
-                    if isinstance(value, (int, float))
-                }
+                # All non-numeric values are artifacts, all numeric values are metrics
+                metrics = {}
+                for key, value in result.items():
+                    if not isinstance(value, (int, float)):
+                        artifacts[key] = value
+                    else:
+                        metrics[key] = float(value)
 
                 # Weight of the model in the ensemble
                 weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -431,7 +449,10 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                     else:
                         avg_metrics[name] = value * weight
 
-            return avg_metrics
+            return EvaluationResult(
+                metrics=avg_metrics,
+                artifacts=artifacts,
+            )
 
         except Exception as e:
             logger.warning(f"Error parsing LLM response: {str(e)}")
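
With these hunks, `_llm_evaluate` splits the parsed JSON into numeric metrics (later weighted per ensemble model) and non-numeric artifacts, and returns an `EvaluationResult` instead of a bare metrics dict, so the `Dict[str, float]` return annotation shown in the hunk context is now stale. A minimal sketch of the split step on a hypothetical parsed response:

```python
# Hypothetical LLM judge output; in the diff, this string is extracted from
# the model response with the json code-fence regex before json.loads.
import json

raw = '{"correctness": 0.9, "efficiency": 0.7, "reasoning": "Loop bounds look safe."}'
result = json.loads(raw)

metrics, artifacts = {}, {}
for key, value in result.items():
    if isinstance(value, (int, float)):
        metrics[key] = float(value)  # later multiplied by the model's ensemble weight
    else:
        artifacts[key] = value  # e.g. free-form reasoning text

print(metrics)    # {'correctness': 0.9, 'efficiency': 0.7}
print(artifacts)  # {'reasoning': 'Loop bounds look safe.'}
```
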