diff --git a/spring-ai-core/src/main/java/org/springframework/ai/evaluation/CorrectnessEvaluationRequest.java b/spring-ai-core/src/main/java/org/springframework/ai/evaluation/CorrectnessEvaluationRequest.java
new file mode 100644
index 00000000000..9428310a756
--- /dev/null
+++ b/spring-ai-core/src/main/java/org/springframework/ai/evaluation/CorrectnessEvaluationRequest.java
@@ -0,0 +1,27 @@
+package org.springframework.ai.evaluation;
+
+import java.util.List;
+
+import org.springframework.ai.model.Content;
+
+/**
+ * Represents an evaluation request for correctness evaluation.
+ *
+ * @author Craig Walls
+ * @since 1.0.0 M2
+ */
+public class CorrectnessEvaluationRequest extends EvaluationRequest {
+
+	private final String referenceAnswer;
+
+	public CorrectnessEvaluationRequest(String userText, List<Content> dataList, String responseContent,
+			String referenceAnswer) {
+		super(userText, dataList, responseContent);
+		this.referenceAnswer = referenceAnswer;
+	}
+
+	public String getReferenceAnswer() {
+		return this.referenceAnswer;
+	}
+
+}
diff --git a/spring-ai-core/src/main/java/org/springframework/ai/evaluation/CorrectnessEvaluator.java b/spring-ai-core/src/main/java/org/springframework/ai/evaluation/CorrectnessEvaluator.java
new file mode 100644
index 00000000000..cd61617d672
--- /dev/null
+++ b/spring-ai-core/src/main/java/org/springframework/ai/evaluation/CorrectnessEvaluator.java
@@ -0,0 +1,99 @@
+package org.springframework.ai.evaluation;
+
+import java.util.Collections;
+
+import org.springframework.ai.chat.client.ChatClient;
+
+/**
+ * Evaluates the correctness of a generated answer.
+ *
+ * The evaluator relies on a reference answer to judge the correctness of the generated
+ * answer.
+ *
+ * The evaluation response includes a score between 1 and 5, where 1 is the worst and 5 is
+ * the best. The evaluator also provides reasoning for the score.
+ *
+ * Passing is determined by the score being greater than or equal to a threshold.
+ *
+ * @author Craig Walls
+ * @since 1.0.0 M2
+ */
+public class CorrectnessEvaluator implements Evaluator {
+
+	private static final String DEFAULT_REFERENCE_ANSWER = "(NO REFERENCE ANSWER SUPPLIED)";
+
+	private static final String DEFAULT_SYSTEM_PROMPT_TEXT = """
+			You are an expert evaluation system for a question answering chatbot.
+
+			You are given the following information:
+			- a user query, and
+			- a generated answer
+
+			You may also be given a reference answer to use for reference in your evaluation.
+
+			Your job is to judge the relevance and correctness of the generated answer.
+			Output a single score that represents a holistic evaluation.
+
+			Follow these guidelines for scoring:
+			- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
+			- If the generated answer is not relevant to the user query,
+			you should give a score of 1.
+			- If the generated answer is relevant but contains mistakes,
+			you should give a score between 2 and 3.
+			- If the generated answer is relevant and fully correct,
+			you should give a score between 4 and 5.
+
+			Example Response:
+			4.0
+			The generated answer has the exact same metrics as the reference answer,
+			but it is not as concise.
+			""";
+
+	private static final String DEFAULT_USER_PROMPT_TEMPLATE = """
+			## User Query
+			{query}
+
+			## Reference Answer
+			{reference_answer}
+
+			## Generated Answer
+			{generated_answer}
+			""";
+
+	private final ChatClient.Builder chatClientBuilder;
+
+	private final float scoreThreshold;
+
+	public CorrectnessEvaluator(ChatClient.Builder chatClientBuilder, float scoreThreshold) {
+		this.chatClientBuilder = chatClientBuilder;
+		this.scoreThreshold = scoreThreshold;
+	}
+
+	@Override
+	public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
+		final String referenceAnswer = (evaluationRequest instanceof CorrectnessEvaluationRequest correctnessRequest)
+				? correctnessRequest.getReferenceAnswer() : DEFAULT_REFERENCE_ANSWER;
+
+		var query = evaluationRequest.getUserText();
+		var generatedAnswer = evaluationRequest.getResponseContent();
+
+		CorrectnessEvaluation evaluationResult = this.chatClientBuilder.build()
+			.prompt()
+			.system(systemSpec -> systemSpec.text(DEFAULT_SYSTEM_PROMPT_TEXT))
+			.user(userSpec -> userSpec.text(DEFAULT_USER_PROMPT_TEMPLATE)
+				.param("query", query)
+				.param("reference_answer", referenceAnswer)
+				.param("generated_answer", generatedAnswer))
+			.call()
+			.entity(CorrectnessEvaluation.class);
+
+		boolean passing = evaluationResult.score() >= this.scoreThreshold;
+
+		return new EvaluationResponse(passing, evaluationResult.score(), evaluationResult.reasoning(),
+				Collections.emptyMap());
+	}
+
+	private record CorrectnessEvaluation(float score, String reasoning) {
+	}
+
+}
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/testing.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/testing.adoc
index 5c1087ebfb6..fcb652998fe 100644
--- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/testing.adoc
+++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/testing.adoc
@@ -103,6 +103,7 @@ The 'claim' and 'document' are presented to the AI model for evaluation. Smaller
 === Usage
 
 The FactCheckingEvaluator constructor takes a ChatClient.Builder as a parameter:
+
 [source,java]
 ----
 public FactCheckingEvaluator(ChatClient.Builder chatClientBuilder) {
@@ -147,4 +148,98 @@ void testFactChecking() {
 
 	assertFalse(evaluationResponse.isPass(), "The claim should not be supported by the context");
 }
------
\ No newline at end of file
+----
+
+== CorrectnessEvaluator
+
+Whereas `FactCheckingEvaluator` establishes whether the generated content is factual given some context data, `CorrectnessEvaluator` determines whether the generated content is correct when compared against a reference answer that is known to be correct. It also produces a score (on a scale of 1 to 5) to gauge how correct the generated content is.
+
+The `CorrectnessEvaluator` submits the following system prompt to the AI model as guidelines for determining correctness:
+
+[source,text]
+----
+You are an expert evaluation system for a question answering chatbot.
+
+You are given the following information:
+- a user query, and
+- a generated answer
+
+You may also be given a reference answer to use for reference in your evaluation.
+
+Your job is to judge the relevance and correctness of the generated answer.
+Output a single score that represents a holistic evaluation.
+
+Follow these guidelines for scoring:
+- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
+- If the generated answer is not relevant to the user query,
+you should give a score of 1.
+- If the generated answer is relevant but contains mistakes,
+you should give a score between 2 and 3.
+- If the generated answer is relevant and fully correct,
+you should give a score between 4 and 5.
+
+Example Response:
+4.0
+The generated answer has the exact same metrics as the reference answer,
+but it is not as concise.
+----
+
+Along with the system prompt, the user query, the reference answer, and the generated answer are provided in the user prompt:
+
+[source,text]
+----
+## User Query
+{query}
+
+## Reference Answer
+{reference_answer}
+
+## Generated Answer
+{generated_answer}
+----
+
+Here is an example of a JUnit test that sends a question to the AI model and then evaluates the correctness of the response against a reference answer:
+
+[source,java]
+----
+@Test
+void testEvaluation() {
+	String userText = "Why is the sky blue?";
+
+	ChatResponse response = ChatClient.builder(chatModel)
+		.build().prompt()
+		.user(userText)
+		.call()
+		.chatResponse();
+
+	var correctnessEvaluator = new CorrectnessEvaluator(ChatClient.builder(chatModel), 3.5f);
+
+	EvaluationResponse evaluationResponse = correctnessEvaluator.evaluate(
+			new CorrectnessEvaluationRequest(
+					userText,
+					List.of(),
+					response.getResult().getOutput().getContent(),
+					"The sky appears blue because air molecules scatter shorter blue wavelengths of sunlight more strongly than longer wavelengths."));
+
+	assertTrue(evaluationResponse.isPass(), "Response is incorrect");
+}
+----
+
+The `CorrectnessEvaluator` is created with a `ChatClient.Builder` as well as a threshold that the score must be greater than or equal to for the evaluation to be considered correct.
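+
+If a plain `EvaluationRequest` is passed instead of a `CorrectnessEvaluationRequest`, no reference answer is available.
+In that case the evaluator substitutes the placeholder text `(NO REFERENCE ANSWER SUPPLIED)` and the model judges correctness from the user query and the generated answer alone.
+The following minimal sketch (reusing the `correctnessEvaluator`, `userText`, and `response` variables from the test above) shows this fallback:
+
+[source,java]
+----
+// Without a reference answer, the evaluator falls back to its
+// "(NO REFERENCE ANSWER SUPPLIED)" placeholder internally.
+EvaluationResponse fallbackResponse = correctnessEvaluator.evaluate(
+		new EvaluationRequest(
+				userText,
+				List.of(),
+				response.getResult().getOutput().getContent()));
+----
+
+Because passing requires the score to be greater than or equal to the threshold, a stricter threshold (for example, `4.5f`) passes only answers that the model judges fully correct.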