Skip to content
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package org.springframework.ai.evaluation;

import java.util.List;

import org.springframework.ai.model.Content;

/**
* Represents an evaluation request for correctness evaluation.
*
* @author Craig Walls
* @since 1.0.0 M2
*/
public class CorrectnessEvaluationRequest extends EvaluationRequest {

	/** The known-good answer used as ground truth when judging the generated answer. */
	private final String referenceAnswer;

	/**
	 * Create a correctness evaluation request.
	 * @param userText the original user query
	 * @param dataList supporting context used to produce the answer
	 * @param responseContent the generated answer to be judged
	 * @param referenceAnswer the ground-truth answer to compare against
	 */
	public CorrectnessEvaluationRequest(String userText, List<Content> dataList, String responseContent,
			String referenceAnswer) {
		super(userText, dataList, responseContent);
		this.referenceAnswer = referenceAnswer;
	}

	/**
	 * @return the reference (ground truth) answer
	 */
	public String getReferenceAnswer() {
		return this.referenceAnswer;
	}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package org.springframework.ai.evaluation;

import java.util.Collections;

import org.springframework.ai.chat.client.ChatClient;

/**
* Evaluates the correctness of a generated answer.
*
* The evaluator relies on a reference answer to judge the correctness of the generated
* answer.
*
* The evaluation response includes a score between 1 and 5, where 1 is the worst and 5 is
* the best. The evaluator also provides reasoning for the score.
*
* Passing is determined by the score being greater than or equal to a threshold.
*
* @author Craig Walls
* @since 1.0.0 M2
*/
public class CorrectnessEvaluator implements Evaluator {

	/** Sentinel sent to the model when the request carries no reference answer. */
	private static final String DEFAULT_REFERENCE_ANSWER = "(NO REFERENCE ANSWER SUPPLIED)";

	// System prompt: instructs the model to emit a single 1-5 score followed by its reasoning.
	private static final String DEFAULT_SYSTEM_PROMPT_TEXT = """
			You are an expert evaluation system for a question answering chatbot.

			You are given the following information:
			- a user query, and
			- a generated answer

			You may also be given a reference answer to use for reference in your evaluation.

			Your job is to judge the relevance and correctness of the generated answer.
			Output a single score that represents a holistic evaluation.

			Follow these guidelines for scoring:
			- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
			- If the generated answer is not relevant to the user query,
			you should give a score of 1.
			- If the generated answer is relevant but contains mistakes,
			you should give a score between 2 and 3.
			- If the generated answer is relevant and fully correct,
			you should give a score between 4 and 5.

			Example Response:
			4.0
			The generated answer has the exact same metrics as the reference answer,
			but it is not as concise.
			""";

	// User prompt template; placeholder names must match the .param(...) keys below.
	private static final String DEFAULT_USER_PROMPT_TEMPLATE = """
			## User Query
			{query}

			## Reference Answer
			{reference_answer}

			## Generated Answer
			{generated_answer}
			""";

	private final ChatClient.Builder chatClientBuilder;

	// Minimum score (inclusive) for the evaluation to pass. final: assigned exactly once
	// in the constructor (the previous "= 4.0f" initializer was dead — always overwritten).
	private final float scoreThreshold;

	/**
	 * Create a correctness evaluator.
	 * @param chatClientBuilder builder for the {@link ChatClient} that performs the
	 * evaluation call
	 * @param scoreThreshold minimum score (inclusive) required for the evaluation to be
	 * considered passing
	 */
	public CorrectnessEvaluator(ChatClient.Builder chatClientBuilder, float scoreThreshold) {
		this.chatClientBuilder = chatClientBuilder;
		this.scoreThreshold = scoreThreshold;
	}

	@Override
	public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
		// Use the supplied reference answer when the request is a
		// CorrectnessEvaluationRequest; otherwise fall back to a sentinel telling the
		// model that no reference was provided.
		final String referenceAnswer = (evaluationRequest instanceof CorrectnessEvaluationRequest correctnessRequest)
				? correctnessRequest.getReferenceAnswer() : DEFAULT_REFERENCE_ANSWER;

		var query = evaluationRequest.getUserText();
		var generatedAnswer = evaluationRequest.getResponseContent();

		// Ask the model to score the answer and bind its reply to the structured record.
		CorrectnessEvaluation evaluationResult = this.chatClientBuilder.build()
			.prompt()
			.system(systemSpec -> systemSpec.text(DEFAULT_SYSTEM_PROMPT_TEXT))
			.user(userSpec -> userSpec.text(DEFAULT_USER_PROMPT_TEMPLATE)
				.param("query", query)
				.param("reference_answer", referenceAnswer)
				.param("generated_answer", generatedAnswer))
			.call()
			.entity(CorrectnessEvaluation.class);

		boolean passing = evaluationResult.score() >= this.scoreThreshold;

		return new EvaluationResponse(passing, evaluationResult.score(), evaluationResult.reasoning(),
				Collections.emptyMap());
	}

	/** Structured model output: a numeric 1-5 score and the model's reasoning for it. */
	private record CorrectnessEvaluation(float score, String reasoning) {
	}

}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.springframework.ai.evaluation;

import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.model.Content;

import java.util.List;
Expand All @@ -20,12 +19,12 @@ public class EvaluationRequest {

private final List<Content> dataList;

private final ChatResponse chatResponse;
private final String responseContent;

public EvaluationRequest(String userText, List<Content> dataList, ChatResponse chatResponse) {
public EvaluationRequest(String userText, List<Content> dataList, String responseContent) {
this.userText = userText;
this.dataList = dataList;
this.chatResponse = chatResponse;
this.responseContent = responseContent;
}

public String getUserText() {
Expand All @@ -36,14 +35,14 @@ public List<Content> getDataList() {
return dataList;
}

public ChatResponse getChatResponse() {
return chatResponse;
public String getResponseContent() {
return responseContent;
}

@Override
public String toString() {
return "EvaluationRequest{" + "userText='" + userText + '\'' + ", dataList=" + dataList + ", chatResponse="
+ chatResponse + '}';
+ responseContent + '}';
}

@Override
Expand All @@ -53,12 +52,12 @@ public boolean equals(Object o) {
if (!(o instanceof EvaluationRequest that))
return false;
return Objects.equals(userText, that.userText) && Objects.equals(dataList, that.dataList)
&& Objects.equals(chatResponse, that.chatResponse);
&& Objects.equals(responseContent, that.responseContent);
}

@Override
public int hashCode() {
return Objects.hash(userText, dataList, chatResponse);
return Objects.hash(userText, dataList, responseContent);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public RelevancyEvaluator(ChatClient.Builder chatClientBuilder) {
@Override
public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {

var response = doGetResponse(evaluationRequest);
var response = evaluationRequest.getResponseContent();
var context = doGetSupportingData(evaluationRequest);

String evaluationResponse = this.chatClientBuilder.build()
Expand All @@ -52,10 +52,6 @@ public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
return new EvaluationResponse(passing, score, "", Collections.emptyMap());
}

protected String doGetResponse(EvaluationRequest evaluationRequest) {
return evaluationRequest.getChatResponse().getResult().getOutput().getContent();
}

protected String doGetSupportingData(EvaluationRequest evaluationRequest) {
List<Content> data = evaluationRequest.getDataList();
String supportingData = data.stream()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,72 @@ void testEvaluation() {
}
----

The code above is from the example application located https://github.com/rd-1-2022/ai-azure-rag.git[here].

== CorrectnessEvaluator

Whereas `RelevancyEvaluator` establishes if the generated content is relevant to the input, `CorrectnessEvaluator` determines if the generated content is correct, as compared with a reference answer that is correct. It also produces a score (with a range of 1 to 5) to gauge how correct the generated content is.

The `CorrectnessEvaluator` submits the following system prompt to the AI model as guidelines for determining correctness:

[source,text]
----
You are an expert evaluation system for a question answering chatbot.
You are given the following information:
- a user query, and
- a generated answer
You may also be given a reference answer to use for reference in your evaluation.
Your job is to judge the relevance and correctness of the generated answer.
Output a single score that represents a holistic evaluation.
Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- If the generated answer is not relevant to the user query,
you should give a score of 1.
- If the generated answer is relevant but contains mistakes,
you should give a score between 2 and 3.
- If the generated answer is relevant and fully correct,
you should give a score between 4 and 5.
Example Response:
4.0
The generated answer has the exact same metrics as the reference answer,
but it is not as concise.
----

Along with the system prompt, the query input, generated answer, and the reference answer are provided in the user prompt:

[source,text]
----
## User Query
{query}
## Reference Answer
{reference_answer}
## Generated Answer
{generated_answer}
----

Here is an example of a JUnit test that performs a RAG query over a PDF document loaded into a Vector Store and then evaluates whether the response is correct when compared with a reference answer.

[source,java]
----
@Test
void testEvaluation() {
String userText = "Why is the sky blue?";

ChatResponse response = ChatClient.builder(chatModel)
.build().prompt()
.user(userText)
.call()
.chatResponse();

var correctnessEvaluator = new CorrectnessEvaluator(ChatClient.builder(chatModel), 3.5f);

EvaluationResponse evaluationResponse = correctnessEvaluator.evaluate(
new EvaluationRequest(
            userText,
List.of(),
"Light scattering makes the sky blue."));

assertTrue(evaluationResponse.isPass(), "Response is incorrect");
}
----

The `CorrectnessEvaluator` is created with a `ChatClient.Builder` as well as a threshold that the score must be greater than or equal to in order for the evaluation to be considered correct.