diff --git a/spring-ai-client-chat/src/main/java/org/springframework/ai/chat/evaluation/RelevancyEvaluator.java b/spring-ai-client-chat/src/main/java/org/springframework/ai/chat/evaluation/RelevancyEvaluator.java
index 5ae2c43933a..62f7ff7c8e6 100644
--- a/spring-ai-client-chat/src/main/java/org/springframework/ai/chat/evaluation/RelevancyEvaluator.java
+++ b/spring-ai-client-chat/src/main/java/org/springframework/ai/chat/evaluation/RelevancyEvaluator.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,51 +16,72 @@ package org.springframework.ai.chat.evaluation;
 import java.util.Collections;
+import java.util.Map;
 
 import org.springframework.ai.chat.client.ChatClient;
+import org.springframework.ai.chat.prompt.PromptTemplate;
 import org.springframework.ai.evaluation.EvaluationRequest;
 import org.springframework.ai.evaluation.EvaluationResponse;
 import org.springframework.ai.evaluation.Evaluator;
+import org.springframework.lang.Nullable;
+import org.springframework.util.Assert;
 
+/**
+ * Evaluates the relevancy of a response to a query based on the context provided.
+ */
 public class RelevancyEvaluator implements Evaluator {
 
-	private static final String DEFAULT_EVALUATION_PROMPT_TEXT = """
+	private static final PromptTemplate DEFAULT_PROMPT_TEMPLATE = new PromptTemplate("""
 			Your task is to evaluate if the response for the query
-			is in line with the context information provided.\n
-			You have two options to answer. Either YES/ NO.\n
-			Answer - YES, if the response for the query
-			is in line with context information otherwise NO.\n
-			Query: \n {query}\n
-			Response: \n {response}\n
-			Context: \n {context}\n
-			Answer: "
-			""";
+			is in line with the context information provided.
+
+			You have two options to answer. Either YES or NO.
+
+			Answer YES, if the response for the query
+			is in line with context information otherwise NO.
+
+			Query:
+			{query}
+
+			Response:
+			{response}
+
+			Context:
+			{context}
+
+			Answer:
+			""");
 
 	private final ChatClient.Builder chatClientBuilder;
 
+	private final PromptTemplate promptTemplate;
+
 	public RelevancyEvaluator(ChatClient.Builder chatClientBuilder) {
+		this(chatClientBuilder, null);
+	}
+
+	private RelevancyEvaluator(ChatClient.Builder chatClientBuilder, @Nullable PromptTemplate promptTemplate) {
+		Assert.notNull(chatClientBuilder, "chatClientBuilder cannot be null");
 		this.chatClientBuilder = chatClientBuilder;
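+		// Fall back to the default prompt template when no custom one is provided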
+		this.promptTemplate = promptTemplate != null ? promptTemplate : DEFAULT_PROMPT_TEMPLATE;
 	}
 
 	@Override
 	public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
 		var response = evaluationRequest.getResponseContent();
 		var context = doGetSupportingData(evaluationRequest);
 
-		String evaluationResponse = this.chatClientBuilder.build()
-			.prompt()
-			.user(userSpec -> userSpec.text(DEFAULT_EVALUATION_PROMPT_TEXT)
-				.param("query", evaluationRequest.getUserText())
-				.param("response", response)
-				.param("context", context))
-			.call()
-			.content();
+		var userMessage = this.promptTemplate
+			.render(Map.of("query", evaluationRequest.getUserText(), "response", response, "context", context));
+
+		String evaluationResponse = this.chatClientBuilder.build().prompt().user(userMessage).call().content();
 
 		boolean passing = false;
 		float score = 0;
-		if (evaluationResponse.toLowerCase().contains("yes")) {
+		if (evaluationResponse != null && evaluationResponse.toLowerCase().contains("yes")) {
 			passing = true;
 			score = 1;
 		}
@@ -68,4 +89,33 @@ public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
 
 		return new EvaluationResponse(passing, score, "", Collections.emptyMap());
 	}
 
+	public static Builder builder() {
+		return new Builder();
+	}
+
+	public static class Builder {
+
+		private ChatClient.Builder chatClientBuilder;
+
+		private PromptTemplate promptTemplate;
+
+		private Builder() {
+		}
+
+		public Builder chatClientBuilder(ChatClient.Builder chatClientBuilder) {
+			this.chatClientBuilder = chatClientBuilder;
+			return this;
+		}
+
+		public Builder promptTemplate(PromptTemplate promptTemplate) {
+			this.promptTemplate = promptTemplate;
+			return this;
+		}
+
+		public RelevancyEvaluator build() {
+			return new RelevancyEvaluator(this.chatClientBuilder, this.promptTemplate);
+		}
+
+	}
+
 }
diff --git a/spring-ai-client-chat/src/test/java/org/springframework/ai/chat/evaluation/RelevancyEvaluatorTests.java b/spring-ai-client-chat/src/test/java/org/springframework/ai/chat/evaluation/RelevancyEvaluatorTests.java
new file mode 100644
index 00000000000..f13260d7fd3
--- /dev/null
+++ b/spring-ai-client-chat/src/test/java/org/springframework/ai/chat/evaluation/RelevancyEvaluatorTests.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2023-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.springframework.ai.chat.evaluation;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.ai.chat.client.ChatClient;
+import org.springframework.ai.chat.model.ChatModel;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.mockito.Mockito.mock;
+
+/**
+ * Unit tests for {@link RelevancyEvaluator}.
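+ * Covers constructor and builder validation and the fallback to the default prompt template.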
+ *
+ * @author Thomas Vitale
+ */
+class RelevancyEvaluatorTests {
+
+	@Test
+	void whenChatClientBuilderIsNullThenThrow() {
+		assertThatThrownBy(() -> new RelevancyEvaluator(null)).isInstanceOf(IllegalArgumentException.class)
+			.hasMessageContaining("chatClientBuilder cannot be null");
+
+		assertThatThrownBy(() -> RelevancyEvaluator.builder().chatClientBuilder(null).build())
+			.isInstanceOf(IllegalArgumentException.class)
+			.hasMessageContaining("chatClientBuilder cannot be null");
+	}
+
+	@Test
+	void whenPromptTemplateIsNullThenUseDefault() {
+		RelevancyEvaluator evaluator = new RelevancyEvaluator(ChatClient.builder(mock(ChatModel.class)));
+		assertThat(evaluator).isNotNull();
+
+		evaluator = RelevancyEvaluator.builder()
+			.chatClientBuilder(ChatClient.builder(mock(ChatModel.class)))
+			.promptTemplate(null)
+			.build();
+		assertThat(evaluator).isNotNull();
+	}
+
+}
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/testing.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/testing.adoc
index 79ad8ca135d..2b7910018cd 100644
--- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/testing.adoc
+++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/testing.adoc
@@ -6,8 +6,6 @@ One method to evaluate the response is to use the AI model itself for evaluation
 
 The Spring AI interface for evaluating responses is `Evaluator`, defined as:
 
-
-
 [source,java]
 ----
 @FunctionalInterface
@@ -42,58 +40,109 @@ public class EvaluationRequest {
 * `dataList`: Contextual data, such as from Retrieval Augmented Generation, appended to the raw input.
 * `responseContent`: The AI model's response content as a `String`
 
-== RelevancyEvaluator
+== Relevancy Evaluator
 
-One implementation is the `RelevancyEvaluator`, which uses the AI model for evaluation. More implementations will be available in future releases.
+The `RelevancyEvaluator` is an implementation of the `Evaluator` interface that assesses the relevance of an AI-generated response to the provided context. It helps judge the quality of a RAG flow by determining whether the AI model's response is relevant to the user's input with respect to the retrieved context.
 
-The `RelevancyEvaluator` uses the input (`userText`) and the AI model's output (`chatResponse`) to ask the question:
+The evaluation is based on the user input, the AI model's response, and the context information. It uses a prompt template to ask the AI model whether the response is relevant to the user input and context.
 
-[source, text]
+This is the default prompt template used by the `RelevancyEvaluator`:
+
+[source,text]
 ----
 Your task is to evaluate if the response for the query
-is in line with the context information provided.\n
-You have two options to answer. Either YES/ NO.\n
-Answer - YES, if the response for the query
-is in line with context information otherwise NO.\n
-Query: \n {query}\n
-Response: \n {response}\n
-Context: \n {context}\n
-Answer: "
-----
+is in line with the context information provided.
 
-Here is an example of a JUnit test that performs a RAG query over a PDF document loaded into a Vector Store and then evaluates if the response is relevant to the user text.
+You have two options to answer. Either YES or NO.
 
-[source,java]
-----
-@Test
-void testEvaluation() {
+Answer YES, if the response for the query
+is in line with context information otherwise NO.
 
-	dataController.delete();
-	dataController.load();
+Query:
+{query}
 
-	String userText = "What is the purpose of Carina?";
+Response:
+{response}
 
-	ChatResponse response = ChatClient.builder(chatModel)
-			.build().prompt()
-			.advisors(new QuestionAnswerAdvisor(vectorStore))
-			.user(userText)
-			.call()
-			.chatResponse();
-	String responseContent = response.getResult().getOutput().getContent();
+Context:
+{context}
 
-	var relevancyEvaluator = new RelevancyEvaluator(ChatClient.builder(chatModel));
+Answer:
+----
 
-	EvaluationRequest evaluationRequest = new EvaluationRequest(userText,
-			(List) response.getMetadata().get(QuestionAnswerAdvisor.RETRIEVED_DOCUMENTS), responseContent);
+NOTE: You can customize the prompt template by providing your own `PromptTemplate` object via the `.promptTemplate()` builder method. See xref:_custom_template[Custom Template] for details.
 
-	EvaluationResponse evaluationResponse = relevancyEvaluator.evaluate(evaluationRequest);
+== Usage in Integration Tests
 
-	assertTrue(evaluationResponse.isPass(), "Response is not relevant to the question");
+Here is an example of using the `RelevancyEvaluator` in an integration test to validate the result of a RAG flow based on the `RetrievalAugmentationAdvisor`:
 
+[source,java]
+----
+@Test
+void evaluateRelevancy() {
+    String question = "Where does the adventure of Anacletus and Birba take place?";
+
+    RetrievalAugmentationAdvisor ragAdvisor = RetrievalAugmentationAdvisor.builder()
+        .documentRetriever(VectorStoreDocumentRetriever.builder()
+            .vectorStore(pgVectorStore)
+            .build())
+        .build();
+
+    ChatResponse chatResponse = ChatClient.builder(chatModel).build()
+        .prompt(question)
+        .advisors(ragAdvisor)
+        .call()
+        .chatResponse();
+
+    EvaluationRequest evaluationRequest = new EvaluationRequest(
+        // The original user question
+        question,
+        // The retrieved context from the RAG flow
+        chatResponse.getMetadata().get(RetrievalAugmentationAdvisor.DOCUMENT_CONTEXT),
+        // The AI model's response
+        chatResponse.getResult().getOutput().getText()
+    );
+
+    RelevancyEvaluator evaluator = new RelevancyEvaluator(ChatClient.builder(chatModel));
+
+    EvaluationResponse evaluationResponse = evaluator.evaluate(evaluationRequest);
+
+    assertThat(evaluationResponse.isPass()).isTrue();
 }
 ----
 
-The code above is from the example application located https://github.com/rd-1-2022/ai-azure-rag.git[here].
+The Spring AI project contains several integration tests that use the `RelevancyEvaluator` to verify the `QuestionAnswerAdvisor` (see https://github.com/spring-projects/spring-ai/blob/main/spring-ai-integration-tests/src/test/java/org/springframework/ai/integration/tests/client/advisor/QuestionAnswerAdvisorIT.java[tests]) and the `RetrievalAugmentationAdvisor` (see https://github.com/spring-projects/spring-ai/blob/main/spring-ai-integration-tests/src/test/java/org/springframework/ai/integration/tests/client/advisor/RetrievalAugmentationAdvisorIT.java[tests]).
+
+=== Custom Template
+
+The `RelevancyEvaluator` uses a default template to prompt the AI model for evaluation. You can customize this behavior by providing your own `PromptTemplate` object via the `.promptTemplate()` builder method.
+
+The custom `PromptTemplate` can use any `TemplateRenderer` implementation (by default, it uses `StTemplateRenderer` based on the https://www.stringtemplate.org/[StringTemplate] engine). The important requirement is that the template must contain the following placeholders, as shown in the example below:
+
+* a `query` placeholder to receive the user question.
+* a `response` placeholder to receive the AI model's response.
+* a `context` placeholder to receive the context information.
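+
+For example, the following sketch builds a `RelevancyEvaluator` with a custom template through the builder. The template wording here is illustrative (any text containing the three placeholders works), and `chatModel` is the same model reference used in the integration test above:
+
+[source,java]
+----
+PromptTemplate customTemplate = new PromptTemplate("""
+        Determine whether the response answers the query using only the given context.
+        Reply with a single word: YES or NO.
+
+        Query: {query}
+        Response: {response}
+        Context: {context}
+
+        Answer:
+        """);
+
+RelevancyEvaluator evaluator = RelevancyEvaluator.builder()
+    .chatClientBuilder(ChatClient.builder(chatModel))
+    .promptTemplate(customTemplate)
+    .build();
+----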
 
 == FactCheckingEvaluator