Skip to content
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package org.springframework.ai.evaluation;

import java.util.List;

import org.springframework.ai.model.Content;

/**
* Represents an evaluation request for correctness evaluation.
*
* @author Craig Walls
* @since 1.0.0 M2
*/
public class CorrectnessEvaluationRequest extends EvaluationRequest {

	/** The known-good answer used as ground truth when judging the generated answer. */
	private final String referenceAnswer;

	/**
	 * Create a correctness evaluation request.
	 * @param userText the original user query
	 * @param dataList supporting context used to produce the answer
	 * @param responseContent the generated answer to be judged
	 * @param referenceAnswer the ground-truth answer to compare against
	 */
	public CorrectnessEvaluationRequest(String userText, List<Content> dataList, String responseContent,
			String referenceAnswer) {
		super(userText, dataList, responseContent);
		this.referenceAnswer = referenceAnswer;
	}

	/**
	 * @return the reference (ground truth) answer
	 */
	public String getReferenceAnswer() {
		return this.referenceAnswer;
	}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package org.springframework.ai.evaluation;

import java.util.Collections;

import org.springframework.ai.chat.client.ChatClient;

/**
* Evaluates the correctness of a generated answer.
*
* The evaluator relies on a reference answer to judge the correctness of the generated
* answer.
*
* The evaluation response includes a score between 1 and 5, where 1 is the worst and 5 is
* the best. The evaluator also provides reasoning for the score.
*
* Passing is determined by the score being greater than or equal to a threshold.
*
* @author Craig Walls
* @since 1.0.0 M2
*/
public class CorrectnessEvaluator implements Evaluator {

	/** Sentinel sent to the model when the request carries no reference answer. */
	private static final String DEFAULT_REFERENCE_ANSWER = "(NO REFERENCE ANSWER SUPPLIED)";

	// System prompt: instructs the model to emit a single 1-5 score followed by its reasoning.
	private static final String DEFAULT_SYSTEM_PROMPT_TEXT = """
			You are an expert evaluation system for a question answering chatbot.

			You are given the following information:
			- a user query, and
			- a generated answer

			You may also be given a reference answer to use for reference in your evaluation.

			Your job is to judge the relevance and correctness of the generated answer.
			Output a single score that represents a holistic evaluation.

			Follow these guidelines for scoring:
			- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
			- If the generated answer is not relevant to the user query,
			you should give a score of 1.
			- If the generated answer is relevant but contains mistakes,
			you should give a score between 2 and 3.
			- If the generated answer is relevant and fully correct,
			you should give a score between 4 and 5.

			Example Response:
			4.0
			The generated answer has the exact same metrics as the reference answer,
			but it is not as concise.
			""";

	// User prompt template; placeholder names must match the .param(...) keys below.
	private static final String DEFAULT_USER_PROMPT_TEMPLATE = """
			## User Query
			{query}

			## Reference Answer
			{reference_answer}

			## Generated Answer
			{generated_answer}
			""";

	private final ChatClient.Builder chatClientBuilder;

	// Minimum score (inclusive) for the evaluation to pass. final: assigned exactly once
	// in the constructor (the previous "= 4.0f" initializer was dead — always overwritten).
	private final float scoreThreshold;

	/**
	 * Create a correctness evaluator.
	 * @param chatClientBuilder builder for the {@link ChatClient} that performs the
	 * evaluation call
	 * @param scoreThreshold minimum score (inclusive) required for the evaluation to be
	 * considered passing
	 */
	public CorrectnessEvaluator(ChatClient.Builder chatClientBuilder, float scoreThreshold) {
		this.chatClientBuilder = chatClientBuilder;
		this.scoreThreshold = scoreThreshold;
	}

	@Override
	public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
		// Use the supplied reference answer when the request is a
		// CorrectnessEvaluationRequest; otherwise fall back to a sentinel telling the
		// model that no reference was provided.
		final String referenceAnswer = (evaluationRequest instanceof CorrectnessEvaluationRequest correctnessRequest)
				? correctnessRequest.getReferenceAnswer() : DEFAULT_REFERENCE_ANSWER;

		var query = evaluationRequest.getUserText();
		var generatedAnswer = evaluationRequest.getResponseContent();

		// Ask the model to score the answer and bind its reply to the structured record.
		CorrectnessEvaluation evaluationResult = this.chatClientBuilder.build()
			.prompt()
			.system(systemSpec -> systemSpec.text(DEFAULT_SYSTEM_PROMPT_TEXT))
			.user(userSpec -> userSpec.text(DEFAULT_USER_PROMPT_TEMPLATE)
				.param("query", query)
				.param("reference_answer", referenceAnswer)
				.param("generated_answer", generatedAnswer))
			.call()
			.entity(CorrectnessEvaluation.class);

		boolean passing = evaluationResult.score() >= this.scoreThreshold;

		return new EvaluationResponse(passing, evaluationResult.score(), evaluationResult.reasoning(),
				Collections.emptyMap());
	}

	/** Structured model output: a numeric 1-5 score and the model's reasoning for it. */
	private record CorrectnessEvaluation(float score, String reasoning) {
	}

}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.springframework.ai.evaluation;

import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.model.Content;

import java.util.List;
Expand All @@ -20,12 +19,12 @@ public class EvaluationRequest {

private final List<Content> dataList;

private final ChatResponse chatResponse;
private final String responseContent;

public EvaluationRequest(String userText, List<Content> dataList, ChatResponse chatResponse) {
public EvaluationRequest(String userText, List<Content> dataList, String responseContent) {
this.userText = userText;
this.dataList = dataList;
this.chatResponse = chatResponse;
this.responseContent = responseContent;
}

public String getUserText() {
Expand All @@ -36,14 +35,14 @@ public List<Content> getDataList() {
return dataList;
}

public ChatResponse getChatResponse() {
return chatResponse;
public String getResponseContent() {
return responseContent;
}

@Override
public String toString() {
return "EvaluationRequest{" + "userText='" + userText + '\'' + ", dataList=" + dataList + ", chatResponse="
+ chatResponse + '}';
+ responseContent + '}';
}

@Override
Expand All @@ -53,12 +52,12 @@ public boolean equals(Object o) {
if (!(o instanceof EvaluationRequest that))
return false;
return Objects.equals(userText, that.userText) && Objects.equals(dataList, that.dataList)
&& Objects.equals(chatResponse, that.chatResponse);
&& Objects.equals(responseContent, that.responseContent);
}

@Override
public int hashCode() {
return Objects.hash(userText, dataList, chatResponse);
return Objects.hash(userText, dataList, responseContent);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public RelevancyEvaluator(ChatClient.Builder chatClientBuilder) {
@Override
public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {

var response = doGetResponse(evaluationRequest);
var response = evaluationRequest.getResponseContent();
var context = doGetSupportingData(evaluationRequest);

String evaluationResponse = this.chatClientBuilder.build()
Expand All @@ -52,10 +52,6 @@ public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
return new EvaluationResponse(passing, score, "", Collections.emptyMap());
}

protected String doGetResponse(EvaluationRequest evaluationRequest) {
return evaluationRequest.getChatResponse().getResult().getOutput().getContent();
}

protected String doGetSupportingData(EvaluationRequest evaluationRequest) {
List<Content> data = evaluationRequest.getDataList();
String supportingData = data.stream()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,72 @@ void testEvaluation() {
}
----

The code above is from the example application located https://github.com/rd-1-2022/ai-azure-rag.git[here].

== CorrectnessEvaluator

Whereas `RelevancyEvaluator` establishes if the generated content is relevant to the input, `CorrectnessEvaluator` determines if the generated content is correct, as compared with a reference answer that is correct. It also produces a score (with a range of 1 to 5) to gauge how correct the generated content is.

The `CorrectnessEvaluator` submits the following system prompt to the AI model as guidelines for determining correctness:

[source,text]
----
You are an expert evaluation system for a question answering chatbot.
You are given the following information:
- a user query, and
- a generated answer
You may also be given a reference answer to use for reference in your evaluation.
Your job is to judge the relevance and correctness of the generated answer.
Output a single score that represents a holistic evaluation.
Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- If the generated answer is not relevant to the user query,
you should give a score of 1.
- If the generated answer is relevant but contains mistakes,
you should give a score between 2 and 3.
- If the generated answer is relevant and fully correct,
you should give a score between 4 and 5.
Example Response:
4.0
The generated answer has the exact same metrics as the reference answer,
but it is not as concise.
----

Along with the system prompt, the query input, generated answer, and the reference answer are provided in the user prompt:

[source,text]
----
## User Query
{query}
## Reference Answer
{reference_answer}
## Generated Answer
{generated_answer}
----

Here is an example of a JUnit test that performs a RAG query over a PDF document loaded into a Vector Store and then evaluates whether the response is correct when compared with a reference answer.

[source,java]
----
@Test
void testEvaluation() {
String userText = "Why is the sky blue?";

ChatResponse response = ChatClient.builder(chatModel)
.build().prompt()
.user(userText)
.call()
.chatResponse();

var correctnessEvaluator = new CorrectnessEvaluator(ChatClient.builder(chatModel), 3.5f);

EvaluationResponse evaluationResponse = correctnessEvaluator.evaluate(
new EvaluationRequest(
            userText,
List.of(),
"Light scattering makes the sky blue."));

assertTrue(evaluationResponse.isPass(), "Response is incorrect");
}
----

The `CorrectnessEvaluator` is created with a `ChatClient.Builder` as well as a threshold that the score must be greater than or equal to in order for the evaluation to be considered correct.