Skip to content

Commit 321fc52

Browse files
committed
Add correctness evaluator
1 parent 0c5455e commit 321fc52

File tree

2 files changed

+129
-0
lines changed

2 files changed

+129
-0
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package org.springframework.ai.evaluation;
2+
3+
import java.util.List;
4+
5+
import org.springframework.ai.model.Content;
6+
7+
/**
8+
* Represents an evaluation request for correctness evaluation.
9+
*
10+
* @author Craig Walls
11+
* @since 1.0.0 M1
12+
*/
13+
public class CorrectnessEvaluationRequest extends EvaluationRequest {
14+
15+
private final String referenceAnswer;
16+
17+
public CorrectnessEvaluationRequest(String userText, List<Content> dataList, String responseContent,
18+
String referenceAnswer) {
19+
super(userText, dataList, responseContent);
20+
this.referenceAnswer = referenceAnswer;
21+
}
22+
23+
public String getReferenceAnswer() {
24+
return referenceAnswer;
25+
}
26+
27+
}
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
package org.springframework.ai.evaluation;
2+
3+
import java.util.Collections;
4+
5+
import org.springframework.ai.chat.client.ChatClient;
6+
7+
/**
8+
* Evaluates the correctness of a generated answer.
9+
*
10+
* The evaluator relies on a reference answer to judge the correctness of the generated
11+
* answer.
12+
*
13+
* The evaluation response includes a score between 1 and 5, where 1 is the worst and 5 is
14+
* the best. The evaluator also provides reasoning for the score.
15+
*
16+
* Passing is determined by the score being greater than or equal to a threshold.
17+
*
18+
* @author Craig Walls
19+
* @since 1.0.0 M1
20+
*/
21+
public class CorrectnessEvaluator implements Evaluator {
22+
23+
private static final String DEFAULT_REFERENCE_ANSWER = "(NO REFERENCE ANSWER SUPPLIED)";
24+
25+
private static final String DEFAULT_SYSTEM_PROMPT_TEXT = """
26+
You are an expert evaluation system for a question answering chatbot.
27+
28+
You are given the following information:
29+
- a user query, and
30+
- a generated answer
31+
32+
You may also be given a reference answer to use for reference in your evaluation.
33+
34+
Your job is to judge the relevance and correctness of the generated answer.
35+
Output a single score that represents a holistic evaluation.
36+
You must return your response in a line with only the score.
37+
Do not return answers in any other format.
38+
On a separate line provide your reasoning for the score as well.
39+
40+
Follow these guidelines for scoring:
41+
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
42+
- If the generated answer is not relevant to the user query,
43+
you should give a score of 1.
44+
- If the generated answer is relevant but contains mistakes,
45+
you should give a score between 2 and 3.
46+
- If the generated answer is relevant and fully correct,
47+
you should give a score between 4 and 5.
48+
49+
Example Response:
50+
4.0
51+
The generated answer has the exact same metrics as the reference answer,
52+
but it is not as concise.
53+
""";
54+
55+
private static final String DEFAULT_USER_PROMPT_TEMPLATE = """
56+
## User Query
57+
{query}
58+
59+
## Reference Answer
60+
{reference_answer}
61+
62+
## Generated Answer
63+
{generated_answer}
64+
""";
65+
66+
private final ChatClient.Builder chatClientBuilder;
67+
68+
private float scoreThreshold = 4.0f;
69+
70+
public CorrectnessEvaluator(ChatClient.Builder chatClientBuilder, float scoreThreshold) {
71+
this.chatClientBuilder = chatClientBuilder;
72+
this.scoreThreshold = scoreThreshold;
73+
}
74+
75+
@Override
76+
public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
77+
final String referenceAnswer = (evaluationRequest instanceof CorrectnessEvaluationRequest)
78+
? ((CorrectnessEvaluationRequest) evaluationRequest).getReferenceAnswer() : DEFAULT_REFERENCE_ANSWER;
79+
80+
var query = evaluationRequest.getUserText();
81+
var generatedAnswer = evaluationRequest.getResponseContent();
82+
83+
CorrectnessEvaluation evaluationResult = this.chatClientBuilder.build()
84+
.prompt()
85+
.system(systemSpec -> systemSpec.text(DEFAULT_SYSTEM_PROMPT_TEXT))
86+
.user(userSpec -> userSpec.text(DEFAULT_USER_PROMPT_TEMPLATE)
87+
.param("query", query)
88+
.param("reference_answer", referenceAnswer)
89+
.param("generated_answer", generatedAnswer))
90+
.call()
91+
.entity(CorrectnessEvaluation.class);
92+
93+
boolean passing = evaluationResult.score() >= this.scoreThreshold;
94+
95+
return new EvaluationResponse(passing, evaluationResult.score(), evaluationResult.reasoning(),
96+
Collections.emptyMap());
97+
}
98+
99+
private record CorrectnessEvaluation(float score, String reasoning) {
100+
}
101+
102+
}

0 commit comments

Comments
 (0)