@@ -2858,56 +2858,56 @@ async def _process_experiment_item(
28582858 metadata = final_observation_metadata ,
28592859 )
28602860
2861- # Run evaluators
2862- evaluations = []
2861+ except Exception as e :
2862+ span .update (
2863+ output = f"Error: { str (e )} " , level = "ERROR" , status_message = str (e )
2864+ )
2865+ raise e
28632866
2864- for evaluator in evaluators :
2865- try :
2866- eval_metadata : Optional [Dict [str , Any ]] = None
2867+ # Run evaluators
2868+ evaluations = []
28672869
2868- if isinstance (item , dict ):
2869- eval_metadata = item .get ("metadata" )
2870- elif hasattr (item , "metadata" ):
2871- eval_metadata = item .metadata
2870+ for evaluator in evaluators :
2871+ try :
2872+ eval_metadata : Optional [Dict [str , Any ]] = None
28722873
2873- eval_results = await _run_evaluator (
2874- evaluator ,
2875- input = input_data ,
2876- output = output ,
2877- expected_output = expected_output ,
2878- metadata = eval_metadata ,
2879- )
2880- evaluations .extend (eval_results )
2881-
2882- # Store evaluations as scores
2883- for evaluation in eval_results :
2884- self .create_score (
2885- trace_id = trace_id ,
2886- observation_id = span .id ,
2887- name = evaluation .name ,
2888- value = evaluation .value , # type: ignore
2889- comment = evaluation .comment ,
2890- metadata = evaluation .metadata ,
2891- config_id = evaluation .config_id ,
2892- data_type = evaluation .data_type , # type: ignore
2893- )
2874+ if isinstance (item , dict ):
2875+ eval_metadata = item .get ("metadata" )
2876+ elif hasattr (item , "metadata" ):
2877+ eval_metadata = item .metadata
28942878
2895- except Exception as e :
2896- langfuse_logger .error (f"Evaluator failed: { e } " )
2879+ eval_results = await _run_evaluator (
2880+ evaluator ,
2881+ input = input_data ,
2882+ output = output ,
2883+ expected_output = expected_output ,
2884+ metadata = eval_metadata ,
2885+ )
2886+ evaluations .extend (eval_results )
2887+
2888+ # Store evaluations as scores
2889+ for evaluation in eval_results :
2890+ self .create_score (
2891+ trace_id = trace_id ,
2892+ observation_id = span .id ,
2893+ name = evaluation .name ,
2894+ value = evaluation .value , # type: ignore
2895+ comment = evaluation .comment ,
2896+ metadata = evaluation .metadata ,
2897+ config_id = evaluation .config_id ,
2898+ data_type = evaluation .data_type , # type: ignore
2899+ )
28972900
2898- return ExperimentItemResult (
2899- item = item ,
2900- output = output ,
2901- evaluations = evaluations ,
2902- trace_id = trace_id ,
2903- dataset_run_id = dataset_run_id ,
2904- )
2901+ except Exception as e :
2902+ langfuse_logger .error (f"Evaluator failed: { e } " )
29052903
2906- except Exception as e :
2907- span .update (
2908- output = f"Error: { str (e )} " , level = "ERROR" , status_message = str (e )
2909- )
2910- raise e
2904+ return ExperimentItemResult (
2905+ item = item ,
2906+ output = output ,
2907+ evaluations = evaluations ,
2908+ trace_id = trace_id ,
2909+ dataset_run_id = dataset_run_id ,
2910+ )
29112911
29122912 def _create_experiment_run_name (
29132913 self , * , name : Optional [str ] = None , run_name : Optional [str ] = None
0 commit comments