| 
 | 1 | +package org.springframework.ai.evaluation;  | 
 | 2 | + | 
 | 3 | +import java.util.Collections;  | 
 | 4 | + | 
 | 5 | +import org.springframework.ai.chat.client.ChatClient;  | 
 | 6 | + | 
 | 7 | +/**  | 
 | 8 | + * Evaluates the correctness of a generated answer.  | 
 | 9 | + *  | 
 | 10 | + * The evaluator relies on a reference answer to judge the correctness of the generated  | 
 | 11 | + * answer.  | 
 | 12 | + *  | 
 | 13 | + * The evaluation response includes a score between 1 and 5, where 1 is the worst and 5 is  | 
 | 14 | + * the best. The evaluator also provides reasoning for the score.  | 
 | 15 | + *  | 
 | 16 | + * Passing is determined by the score being greater than or equal to a threshold.  | 
 | 17 | + *  | 
 | 18 | + * @author Craig Walls  | 
 | 19 | + * @since 1.0.0 M1  | 
 | 20 | + */  | 
 | 21 | +public class CorrectnessEvaluator implements Evaluator {  | 
 | 22 | + | 
 | 23 | +	private static final String DEFAULT_REFERENCE_ANSWER = "(NO REFERENCE ANSWER SUPPLIED)";  | 
 | 24 | + | 
 | 25 | +	private static final String DEFAULT_SYSTEM_PROMPT_TEXT = """  | 
 | 26 | +			    You are an expert evaluation system for a question answering chatbot.  | 
 | 27 | +
  | 
 | 28 | +			    You are given the following information:  | 
 | 29 | +			    - a user query, and  | 
 | 30 | +			    - a generated answer  | 
 | 31 | +
  | 
 | 32 | +			    You may also be given a reference answer to use for reference in your evaluation.  | 
 | 33 | +
  | 
 | 34 | +			    Your job is to judge the relevance and correctness of the generated answer.  | 
 | 35 | +			    Output a single score that represents a holistic evaluation.  | 
 | 36 | +			    You must return your response in a line with only the score.  | 
 | 37 | +			    Do not return answers in any other format.  | 
 | 38 | +			    On a separate line provide your reasoning for the score as well.  | 
 | 39 | +
  | 
 | 40 | +			    Follow these guidelines for scoring:  | 
 | 41 | +			    - Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.  | 
 | 42 | +			    - If the generated answer is not relevant to the user query,  | 
 | 43 | +			    you should give a score of 1.  | 
 | 44 | +			    - If the generated answer is relevant but contains mistakes,  | 
 | 45 | +			    you should give a score between 2 and 3.  | 
 | 46 | +			    - If the generated answer is relevant and fully correct,  | 
 | 47 | +			    you should give a score between 4 and 5.  | 
 | 48 | +
  | 
 | 49 | +			    Example Response:  | 
 | 50 | +			    4.0  | 
 | 51 | +			    The generated answer has the exact same metrics as the reference answer,  | 
 | 52 | +			    but it is not as concise.  | 
 | 53 | +			""";  | 
 | 54 | + | 
 | 55 | +	private static final String DEFAULT_USER_PROMPT_TEMPLATE = """  | 
 | 56 | +			    ## User Query  | 
 | 57 | +			    {query}  | 
 | 58 | +
  | 
 | 59 | +			    ## Reference Answer  | 
 | 60 | +			    {reference_answer}  | 
 | 61 | +
  | 
 | 62 | +			    ## Generated Answer  | 
 | 63 | +			    {generated_answer}  | 
 | 64 | +			""";  | 
 | 65 | + | 
 | 66 | +	private final ChatClient.Builder chatClientBuilder;  | 
 | 67 | + | 
 | 68 | +	private float scoreThreshold = 4.0f;  | 
 | 69 | + | 
 | 70 | +	public CorrectnessEvaluator(ChatClient.Builder chatClientBuilder, float scoreThreshold) {  | 
 | 71 | +		this.chatClientBuilder = chatClientBuilder;  | 
 | 72 | +		this.scoreThreshold = scoreThreshold;  | 
 | 73 | +	}  | 
 | 74 | + | 
 | 75 | +	@Override  | 
 | 76 | +	public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {  | 
 | 77 | +		final String referenceAnswer = (evaluationRequest instanceof CorrectnessEvaluationRequest)  | 
 | 78 | +				? ((CorrectnessEvaluationRequest) evaluationRequest).getReferenceAnswer() : DEFAULT_REFERENCE_ANSWER;  | 
 | 79 | + | 
 | 80 | +		var query = evaluationRequest.getUserText();  | 
 | 81 | +		var generatedAnswer = evaluationRequest.getResponseContent();  | 
 | 82 | + | 
 | 83 | +		CorrectnessEvaluation evaluationResult = this.chatClientBuilder.build()  | 
 | 84 | +			.prompt()  | 
 | 85 | +			.system(systemSpec -> systemSpec.text(DEFAULT_SYSTEM_PROMPT_TEXT))  | 
 | 86 | +			.user(userSpec -> userSpec.text(DEFAULT_USER_PROMPT_TEMPLATE)  | 
 | 87 | +				.param("query", query)  | 
 | 88 | +				.param("reference_answer", referenceAnswer)  | 
 | 89 | +				.param("generated_answer", generatedAnswer))  | 
 | 90 | +			.call()  | 
 | 91 | +			.entity(CorrectnessEvaluation.class);  | 
 | 92 | + | 
 | 93 | +		boolean passing = evaluationResult.score() >= this.scoreThreshold;  | 
 | 94 | + | 
 | 95 | +		return new EvaluationResponse(passing, evaluationResult.score(), evaluationResult.reasoning(),  | 
 | 96 | +				Collections.emptyMap());  | 
 | 97 | +	}  | 
 | 98 | + | 
 | 99 | +	private record CorrectnessEvaluation(float score, String reasoning) {  | 
 | 100 | +	}  | 
 | 101 | + | 
 | 102 | +}  | 
0 commit comments