@@ -0,0 +1,27 @@
package org.springframework.ai.evaluation;

import java.util.List;

import org.springframework.ai.model.Content;

/**
* Represents an evaluation request for correctness evaluation.
*
* @author Craig Walls
* @since 1.0.0 M2
*/
public class CorrectnessEvaluationRequest extends EvaluationRequest {

private final String referenceAnswer;

public CorrectnessEvaluationRequest(String userText, List<Content> dataList, String responseContent,
String referenceAnswer) {
super(userText, dataList, responseContent);
this.referenceAnswer = referenceAnswer;
}

public String getReferenceAnswer() {
return referenceAnswer;
}

}
@@ -0,0 +1,99 @@
package org.springframework.ai.evaluation;

import java.util.Collections;

import org.springframework.ai.chat.client.ChatClient;

/**
* Evaluates the correctness of a generated answer.
*
* The evaluator relies on a reference answer to judge the correctness of the generated
* answer.
*
* The evaluation response includes a score between 1 and 5, where 1 is the worst and 5 is
* the best. The evaluator also provides reasoning for the score.
*
* Passing is determined by the score being greater than or equal to a threshold.
*
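* For example (an illustrative sketch, not part of this class; assumes a configured
* {@code ChatClient.Builder} and a generated answer to evaluate):
* <pre>{@code
* CorrectnessEvaluator evaluator = new CorrectnessEvaluator(chatClientBuilder, 4.0f);
* EvaluationResponse response = evaluator.evaluate(new CorrectnessEvaluationRequest(
*     "Why is the sky blue?", List.of(), generatedAnswer, "Light scattering makes the sky blue."));
* boolean passing = response.isPass();
* }</pre>
*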
* @author Craig Walls
* @since 1.0.0 M2
*/
public class CorrectnessEvaluator implements Evaluator {

private static final String DEFAULT_REFERENCE_ANSWER = "(NO REFERENCE ANSWER SUPPLIED)";

private static final String DEFAULT_SYSTEM_PROMPT_TEXT = """
You are an expert evaluation system for a question answering chatbot.

You are given the following information:
- a user query, and
- a generated answer

You may also be given a reference answer to use for reference in your evaluation.

Your job is to judge the relevance and correctness of the generated answer.
Output a single score that represents a holistic evaluation.

Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- If the generated answer is not relevant to the user query,
you should give a score of 1.
- If the generated answer is relevant but contains mistakes,
you should give a score between 2 and 3.
- If the generated answer is relevant and fully correct,
you should give a score between 4 and 5.

Example Response:
4.0
The generated answer has the exact same metrics as the reference answer,
but it is not as concise.
""";

private static final String DEFAULT_USER_PROMPT_TEMPLATE = """
## User Query
{query}

## Reference Answer
{reference_answer}

## Generated Answer
{generated_answer}
""";

private final ChatClient.Builder chatClientBuilder;

private final float scoreThreshold;

public CorrectnessEvaluator(ChatClient.Builder chatClientBuilder, float scoreThreshold) {
this.chatClientBuilder = chatClientBuilder;
this.scoreThreshold = scoreThreshold;
}

@Override
public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
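// Use the reference answer from a CorrectnessEvaluationRequest when one is supplied; otherwise fall back to a placeholder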
final String referenceAnswer = (evaluationRequest instanceof CorrectnessEvaluationRequest)
? ((CorrectnessEvaluationRequest) evaluationRequest).getReferenceAnswer() : DEFAULT_REFERENCE_ANSWER;

var query = evaluationRequest.getUserText();
var generatedAnswer = evaluationRequest.getResponseContent();

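// Ask the evaluation model for a score and reasoning, mapped onto the CorrectnessEvaluation record as structured output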
CorrectnessEvaluation evaluationResult = this.chatClientBuilder.build()
.prompt()
.system(systemSpec -> systemSpec.text(DEFAULT_SYSTEM_PROMPT_TEXT))
.user(userSpec -> userSpec.text(DEFAULT_USER_PROMPT_TEMPLATE)
.param("query", query)
.param("reference_answer", referenceAnswer)
.param("generated_answer", generatedAnswer))
.call()
.entity(CorrectnessEvaluation.class);

boolean passing = evaluationResult.score() >= this.scoreThreshold;

return new EvaluationResponse(passing, evaluationResult.score(), evaluationResult.reasoning(),
Collections.emptyMap());
}

private record CorrectnessEvaluation(float score, String reasoning) {
}

}
@@ -103,6 +103,7 @@ The 'claim' and 'document' are presented to the AI model for evaluation. Smaller

=== Usage
The FactCheckingEvaluator constructor takes a ChatClient.Builder as a parameter:

[source,java]
----
public FactCheckingEvaluator(ChatClient.Builder chatClientBuilder) {
@@ -147,4 +148,72 @@ void testFactChecking() {
assertFalse(evaluationResponse.isPass(), "The claim should not be supported by the context");

}
----

== CorrectnessEvaluator

Whereas `FactCheckingEvaluator` establishes whether the generated content is factual given some context data, `CorrectnessEvaluator` determines whether the generated content is correct when compared with a reference answer that is known to be correct. It also produces a score (ranging from 1 to 5) to gauge how correct the generated content is.
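
The reference answer is supplied through a `CorrectnessEvaluationRequest`. Here is a minimal sketch of assembling such a request; the `generatedAnswer` value is a placeholder for whatever content your application produced:

[source,java]
----
String generatedAnswer = "The sky is blue because of Rayleigh scattering."; // placeholder answer to judge

CorrectnessEvaluationRequest request = new CorrectnessEvaluationRequest(
        "Why is the sky blue?",                   // the user's query
        List.of(),                                // context documents (a List<Content>), empty here
        generatedAnswer,                          // the generated answer to evaluate
        "Light scattering makes the sky blue."); // the reference answer known to be correct
----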

The `CorrectnessEvaluator` submits the following system prompt to the AI model as guidelines for determining correctness:

[source,text]
----
You are an expert evaluation system for a question answering chatbot.
You are given the following information:
- a user query, and
- a generated answer
You may also be given a reference answer to use for reference in your evaluation.
Your job is to judge the relevance and correctness of the generated answer.
Output a single score that represents a holistic evaluation.
Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- If the generated answer is not relevant to the user query,
you should give a score of 1.
- If the generated answer is relevant but contains mistakes,
you should give a score between 2 and 3.
- If the generated answer is relevant and fully correct,
you should give a score between 4 and 5.
Example Response:
4.0
The generated answer has the exact same metrics as the reference answer,
but it is not as concise.
----

Along with the system prompt, the user query, the reference answer, and the generated answer are provided in the user prompt, which is built from the following template:

[source,text]
----
## User Query
{query}

## Reference Answer
{reference_answer}

## Generated Answer
{generated_answer}
----
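
For example, given the query "Why is the sky blue?", the reference answer "Light scattering makes the sky blue.", and a hypothetical generated answer, the rendered user prompt would look something like this:

[source,text]
----
## User Query
Why is the sky blue?

## Reference Answer
Light scattering makes the sky blue.

## Generated Answer
The sky looks blue because molecules in the air scatter blue light from the sun more than they scatter red light.
----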

Here is an example of a JUnit test that asks the model a question and then uses `CorrectnessEvaluator` to judge whether the generated answer is correct with respect to a reference answer:

[source,java]
----
@Test
void testEvaluation() {
    String userText = "Why is the sky blue?";

    ChatResponse response = ChatClient.builder(chatModel)
        .build()
        .prompt()
        .user(userText)
        .call()
        .chatResponse();

    String generatedAnswer = response.getResult().getOutput().getContent();

    var correctnessEvaluator = new CorrectnessEvaluator(ChatClient.builder(chatModel), 3.5f);

    EvaluationResponse evaluationResponse = correctnessEvaluator.evaluate(
            new CorrectnessEvaluationRequest(
                userText,
                List.of(),
                generatedAnswer,
                "Light scattering makes the sky blue."));

    assertTrue(evaluationResponse.isPass(), "Response is incorrect");
}
----

The `CorrectnessEvaluator` is created with a `ChatClient.Builder` as well as a score threshold; the evaluation passes only when the score is greater than or equal to that threshold.
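
In a Spring Boot application, the auto-configured `ChatClient.Builder` can be injected and used to create the evaluator. The following is a minimal sketch, assuming Spring AI's chat auto-configuration is on the classpath; the `AnswerQualityService` class and its method are hypothetical names used only for illustration:

[source,java]
----
@Service
public class AnswerQualityService {

    private final CorrectnessEvaluator correctnessEvaluator;

    public AnswerQualityService(ChatClient.Builder chatClientBuilder) {
        // Require a score of 4.0 or higher for the evaluation to pass
        this.correctnessEvaluator = new CorrectnessEvaluator(chatClientBuilder, 4.0f);
    }

    public boolean isCorrect(String question, String generatedAnswer, String referenceAnswer) {
        EvaluationResponse response = this.correctnessEvaluator.evaluate(
                new CorrectnessEvaluationRequest(question, List.of(), generatedAnswer, referenceAnswer));
        return response.isPass();
    }
}
----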