diff --git a/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/BedrockChatOptions.java b/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/BedrockChatOptions.java index ba0eceb2a62..54fcd367df6 100644 --- a/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/BedrockChatOptions.java +++ b/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/BedrockChatOptions.java @@ -29,6 +29,7 @@ import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonProperty; +import org.springframework.ai.bedrock.converse.api.BedrockCacheOptions; import org.springframework.ai.model.tool.ToolCallingChatOptions; import org.springframework.ai.tool.ToolCallback; import org.springframework.lang.Nullable; @@ -81,6 +82,9 @@ public class BedrockChatOptions implements ToolCallingChatOptions { @JsonIgnore private Boolean internalToolExecutionEnabled; + @JsonIgnore + private BedrockCacheOptions cacheOptions; + public static Builder builder() { return new Builder(); } @@ -101,6 +105,7 @@ public static BedrockChatOptions fromOptions(BedrockChatOptions fromOptions) { .toolNames(new HashSet<>(fromOptions.getToolNames())) .toolContext(new HashMap<>(fromOptions.getToolContext())) .internalToolExecutionEnabled(fromOptions.getInternalToolExecutionEnabled()) + .cacheOptions(fromOptions.getCacheOptions()) .build(); } @@ -237,6 +242,16 @@ public void setInternalToolExecutionEnabled(@Nullable Boolean internalToolExecut this.internalToolExecutionEnabled = internalToolExecutionEnabled; } + @JsonIgnore + public BedrockCacheOptions getCacheOptions() { + return this.cacheOptions; + } + + @JsonIgnore + public void setCacheOptions(BedrockCacheOptions cacheOptions) { + this.cacheOptions = cacheOptions; + } + @Override @SuppressWarnings("unchecked") public BedrockChatOptions copy() { @@ -259,14 +274,15 @@ public boolean equals(Object o) { && Objects.equals(this.temperature, that.temperature) && Objects.equals(this.topK, that.topK) && Objects.equals(this.topP, that.topP) && Objects.equals(this.toolCallbacks, that.toolCallbacks) && Objects.equals(this.toolNames, that.toolNames) && Objects.equals(this.toolContext, that.toolContext) - && Objects.equals(this.internalToolExecutionEnabled, that.internalToolExecutionEnabled); + && Objects.equals(this.internalToolExecutionEnabled, that.internalToolExecutionEnabled) + && Objects.equals(this.cacheOptions, that.cacheOptions); } @Override public int hashCode() { return Objects.hash(this.model, this.frequencyPenalty, this.maxTokens, this.presencePenalty, this.requestParameters, this.stopSequences, this.temperature, this.topK, this.topP, this.toolCallbacks, - this.toolNames, this.toolContext, this.internalToolExecutionEnabled); + this.toolNames, this.toolContext, this.internalToolExecutionEnabled, this.cacheOptions); } public static final class Builder { @@ -356,6 +372,11 @@ public Builder internalToolExecutionEnabled(@Nullable Boolean internalToolExecut return this; } + public Builder cacheOptions(BedrockCacheOptions cacheOptions) { + this.options.setCacheOptions(cacheOptions); + return this; + } + public BedrockChatOptions build() { return this.options; } diff --git a/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/BedrockProxyChatModel.java b/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/BedrockProxyChatModel.java index 3c4e287ec2c..adc54cf132d 
100644 --- a/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/BedrockProxyChatModel.java +++ b/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/BedrockProxyChatModel.java @@ -24,6 +24,7 @@ import java.time.Duration; import java.util.ArrayList; import java.util.Base64; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; @@ -46,6 +47,7 @@ import software.amazon.awssdk.regions.providers.DefaultAwsRegionProviderChain; import software.amazon.awssdk.services.bedrockruntime.BedrockRuntimeAsyncClient; import software.amazon.awssdk.services.bedrockruntime.BedrockRuntimeClient; +import software.amazon.awssdk.services.bedrockruntime.model.CachePointBlock; import software.amazon.awssdk.services.bedrockruntime.model.ContentBlock; import software.amazon.awssdk.services.bedrockruntime.model.ConversationRole; import software.amazon.awssdk.services.bedrockruntime.model.ConverseMetrics; @@ -74,6 +76,8 @@ import software.amazon.awssdk.services.bedrockruntime.model.VideoFormat; import software.amazon.awssdk.services.bedrockruntime.model.VideoSource; +import org.springframework.ai.bedrock.converse.api.BedrockCacheOptions; +import org.springframework.ai.bedrock.converse.api.BedrockCacheStrategy; import org.springframework.ai.bedrock.converse.api.BedrockMediaFormat; import org.springframework.ai.bedrock.converse.api.ConverseApiUtils; import org.springframework.ai.bedrock.converse.api.URLValidator; @@ -316,6 +320,8 @@ else if (prompt.getOptions() instanceof ToolCallingChatOptions toolCallingChatOp .internalToolExecutionEnabled(runtimeOptions.getInternalToolExecutionEnabled() != null ? runtimeOptions.getInternalToolExecutionEnabled() : this.defaultOptions.getInternalToolExecutionEnabled()) + .cacheOptions(runtimeOptions.getCacheOptions() != null ? 
runtimeOptions.getCacheOptions() + : this.defaultOptions.getCacheOptions()) .build(); } @@ -326,85 +332,164 @@ else if (prompt.getOptions() instanceof ToolCallingChatOptions toolCallingChatOp ConverseRequest createRequest(Prompt prompt) { - List instructionMessages = prompt.getInstructions() + BedrockChatOptions updatedRuntimeOptions = prompt.getOptions().copy(); + + // Get cache options to determine strategy + BedrockCacheOptions cacheOptions = updatedRuntimeOptions.getCacheOptions(); + boolean shouldCacheConversationHistory = cacheOptions != null + && cacheOptions.getStrategy() == BedrockCacheStrategy.CONVERSATION_HISTORY; + + // Get all non-system messages + List allNonSystemMessages = prompt.getInstructions() .stream() .filter(message -> message.getMessageType() != MessageType.SYSTEM) - .map(message -> { - if (message.getMessageType() == MessageType.USER) { - List contents = new ArrayList<>(); - if (message instanceof UserMessage userMessage) { - contents.add(ContentBlock.fromText(userMessage.getText())); - - if (!CollectionUtils.isEmpty(userMessage.getMedia())) { - List mediaContent = userMessage.getMedia() - .stream() - .map(this::mapMediaToContentBlock) - .toList(); - contents.addAll(mediaContent); - } - } - return Message.builder().content(contents).role(ConversationRole.USER).build(); + .toList(); + + // Find the last user message index for CONVERSATION_HISTORY caching + int lastUserMessageIndex = -1; + if (shouldCacheConversationHistory) { + for (int i = allNonSystemMessages.size() - 1; i >= 0; i--) { + if (allNonSystemMessages.get(i).getMessageType() == MessageType.USER) { + lastUserMessageIndex = i; + break; } - else if (message.getMessageType() == MessageType.ASSISTANT) { - AssistantMessage assistantMessage = (AssistantMessage) message; - List contentBlocks = new ArrayList<>(); - if (StringUtils.hasText(message.getText())) { - contentBlocks.add(ContentBlock.fromText(message.getText())); + } + if (logger.isDebugEnabled()) { + logger.debug("CONVERSATION_HISTORY caching: lastUserMessageIndex={}, totalMessages={}", + lastUserMessageIndex, allNonSystemMessages.size()); + } + } + + // Build instruction messages with potential caching + List instructionMessages = new ArrayList<>(); + for (int i = 0; i < allNonSystemMessages.size(); i++) { + org.springframework.ai.chat.messages.Message message = allNonSystemMessages.get(i); + + // Determine if this message should have a cache point + // For CONVERSATION_HISTORY: cache point goes on the last user message + boolean shouldApplyCachePoint = shouldCacheConversationHistory && i == lastUserMessageIndex; + + if (message.getMessageType() == MessageType.USER) { + List contents = new ArrayList<>(); + if (message instanceof UserMessage) { + var userMessage = (UserMessage) message; + contents.add(ContentBlock.fromText(userMessage.getText())); + + if (!CollectionUtils.isEmpty(userMessage.getMedia())) { + List mediaContent = userMessage.getMedia() + .stream() + .map(this::mapMediaToContentBlock) + .toList(); + contents.addAll(mediaContent); } - if (!CollectionUtils.isEmpty(assistantMessage.getToolCalls())) { - for (AssistantMessage.ToolCall toolCall : assistantMessage.getToolCalls()) { + } - var argumentsDocument = ConverseApiUtils - .convertObjectToDocument(ModelOptionsUtils.jsonToMap(toolCall.arguments())); + // Apply cache point if this is the last user message + if (shouldApplyCachePoint) { + CachePointBlock cachePoint = CachePointBlock.builder().type("default").build(); + contents.add(ContentBlock.fromCachePoint(cachePoint)); + 
logger.debug("Applied cache point on last user message (conversation history caching)"); + } + + instructionMessages.add(Message.builder().content(contents).role(ConversationRole.USER).build()); + } + else if (message.getMessageType() == MessageType.ASSISTANT) { + AssistantMessage assistantMessage = (AssistantMessage) message; + List contentBlocks = new ArrayList<>(); + if (StringUtils.hasText(message.getText())) { + contentBlocks.add(ContentBlock.fromText(message.getText())); + } + if (!CollectionUtils.isEmpty(assistantMessage.getToolCalls())) { + for (AssistantMessage.ToolCall toolCall : assistantMessage.getToolCalls()) { - contentBlocks.add(ContentBlock.fromToolUse(ToolUseBlock.builder() - .toolUseId(toolCall.id()) - .name(toolCall.name()) - .input(argumentsDocument) - .build())); + var argumentsDocument = ConverseApiUtils + .convertObjectToDocument(ModelOptionsUtils.jsonToMap(toolCall.arguments())); + + contentBlocks.add(ContentBlock.fromToolUse(ToolUseBlock.builder() + .toolUseId(toolCall.id()) + .name(toolCall.name()) + .input(argumentsDocument) + .build())); - } } - return Message.builder().content(contentBlocks).role(ConversationRole.ASSISTANT).build(); } - else if (message.getMessageType() == MessageType.TOOL) { - List contentBlocks = ((ToolResponseMessage) message).getResponses() - .stream() - .map(toolResponse -> { + + instructionMessages + .add(Message.builder().content(contentBlocks).role(ConversationRole.ASSISTANT).build()); + } + else if (message.getMessageType() == MessageType.TOOL) { + List contentBlocks = new ArrayList<>( + ((ToolResponseMessage) message).getResponses().stream().map(toolResponse -> { ToolResultBlock toolResultBlock = ToolResultBlock.builder() .toolUseId(toolResponse.id()) .content(ToolResultContentBlock.builder().text(toolResponse.responseData()).build()) .build(); return ContentBlock.fromToolResult(toolResultBlock); - }) - .toList(); - return Message.builder().content(contentBlocks).role(ConversationRole.USER).build(); - } - else { - throw new IllegalArgumentException("Unsupported message type: " + message.getMessageType()); - } - }) - .toList(); + }).toList()); + + instructionMessages.add(Message.builder().content(contentBlocks).role(ConversationRole.USER).build()); + } + else { + throw new IllegalArgumentException("Unsupported message type: " + message.getMessageType()); + } + } + + // Determine if system message caching should be applied + boolean shouldCacheSystem = cacheOptions != null + && (cacheOptions.getStrategy() == BedrockCacheStrategy.SYSTEM_ONLY + || cacheOptions.getStrategy() == BedrockCacheStrategy.SYSTEM_AND_TOOLS); + + if (logger.isDebugEnabled() && cacheOptions != null) { + logger.debug("Cache strategy: {}, shouldCacheSystem: {}", cacheOptions.getStrategy(), shouldCacheSystem); + } - List systemMessages = prompt.getInstructions() + // Build system messages with optional caching on last message + List systemMessageList = prompt.getInstructions() .stream() .filter(m -> m.getMessageType() == MessageType.SYSTEM) - .map(sysMessage -> SystemContentBlock.builder().text(sysMessage.getText()).build()) .toList(); - BedrockChatOptions updatedRuntimeOptions = prompt.getOptions().copy(); + List systemMessages = new ArrayList<>(); + for (int i = 0; i < systemMessageList.size(); i++) { + org.springframework.ai.chat.messages.Message sysMessage = systemMessageList.get(i); + + // Add the text content block + SystemContentBlock textBlock = SystemContentBlock.builder().text(sysMessage.getText()).build(); + systemMessages.add(textBlock); + + // Apply cache 
point marker after last system message if caching is enabled + // SystemContentBlock is a UNION type - text and cachePoint must be separate + // blocks + boolean isLastSystem = (i == systemMessageList.size() - 1); + if (isLastSystem && shouldCacheSystem) { + CachePointBlock cachePoint = CachePointBlock.builder().type("default").build(); + SystemContentBlock cachePointBlock = SystemContentBlock.builder().cachePoint(cachePoint).build(); + systemMessages.add(cachePointBlock); + logger.debug("Applied cache point after system message"); + } + } ToolConfiguration toolConfiguration = null; // Add the tool definitions to the request's tools parameter. List toolDefinitions = this.toolCallingManager.resolveToolDefinitions(updatedRuntimeOptions); + // Determine if tool caching should be applied + boolean shouldCacheTools = cacheOptions != null + && (cacheOptions.getStrategy() == BedrockCacheStrategy.TOOLS_ONLY + || cacheOptions.getStrategy() == BedrockCacheStrategy.SYSTEM_AND_TOOLS); + if (!CollectionUtils.isEmpty(toolDefinitions)) { - List bedrockTools = toolDefinitions.stream().map(toolDefinition -> { + List bedrockTools = new ArrayList<>(); + + for (int i = 0; i < toolDefinitions.size(); i++) { + ToolDefinition toolDefinition = toolDefinitions.get(i); var description = toolDefinition.description(); var name = toolDefinition.name(); String inputSchema = toolDefinition.inputSchema(); - return Tool.builder() + + // Create tool specification + Tool tool = Tool.builder() .toolSpec(ToolSpecification.builder() .name(name) .description(description) @@ -412,7 +497,18 @@ else if (message.getMessageType() == MessageType.TOOL) { ConverseApiUtils.convertObjectToDocument(ModelOptionsUtils.jsonToMap(inputSchema)))) .build()) .build(); - }).toList(); + bedrockTools.add(tool); + + // Apply cache point marker after last tool if caching is enabled + // Tool is a UNION type - toolSpec and cachePoint must be separate objects + boolean isLastTool = (i == toolDefinitions.size() - 1); + if (isLastTool && shouldCacheTools) { + CachePointBlock cachePoint = CachePointBlock.builder().type("default").build(); + Tool cachePointTool = Tool.builder().cachePoint(cachePoint).build(); + bedrockTools.add(cachePointTool); + logger.debug("Applied cache point after tool definitions"); + } + } toolConfiguration = ToolConfiguration.builder().tools(bedrockTools).build(); } @@ -635,12 +731,23 @@ private ChatResponse toChatResponse(ConverseResponse response, ChatResponse perv ConverseMetrics metrics = response.metrics(); - var chatResponseMetaData = ChatResponseMetadata.builder() + var metadataBuilder = ChatResponseMetadata.builder() .id(response.responseMetadata() != null ? 
response.responseMetadata().requestId() : "Unknown") - .usage(usage) - .build(); + .usage(usage); + + // Add cache metrics if available + Map additionalMetadata = new HashMap<>(); + if (response.usage().cacheReadInputTokens() != null) { + additionalMetadata.put("cacheReadInputTokens", response.usage().cacheReadInputTokens()); + } + if (response.usage().cacheWriteInputTokens() != null) { + additionalMetadata.put("cacheWriteInputTokens", response.usage().cacheWriteInputTokens()); + } + if (!additionalMetadata.isEmpty()) { + metadataBuilder.metadata(additionalMetadata); + } - return new ChatResponse(allGenerations, chatResponseMetaData); + return new ChatResponse(allGenerations, metadataBuilder.build()); } /** diff --git a/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/api/BedrockCacheOptions.java b/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/api/BedrockCacheOptions.java new file mode 100644 index 00000000000..cb15ff6ab03 --- /dev/null +++ b/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/api/BedrockCacheOptions.java @@ -0,0 +1,110 @@ +/* + * Copyright 2023-2025 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.springframework.ai.bedrock.converse.api; + +/** + * AWS Bedrock cache options for configuring prompt caching behavior. + * + *

+ * <p>
+ * Prompt caching allows you to reduce latency and costs by reusing previously processed
+ * prompt content. Cached content has a fixed 5-minute Time To Live (TTL) that resets with
+ * each cache hit.
+ *
+ * <p>
+ * Example usage:
+ *
+ * <pre>{@code
+ * BedrockCacheOptions cacheOptions = BedrockCacheOptions.builder()
+ *     .strategy(BedrockCacheStrategy.SYSTEM_ONLY)
+ *     .build();
+ *
+ * ChatResponse response = chatModel.call(new Prompt(
+ *     List.of(new SystemMessage(largeSystemPrompt), new UserMessage("Question")),
+ *     BedrockChatOptions.builder()
+ *         .cacheOptions(cacheOptions)
+ *         .build()
+ * ));
+ * }
+ * + * @author Soby Chacko + * @since 1.1.0 + * @see BedrockCacheStrategy + * @see AWS Bedrock + * Prompt Caching + */ +public class BedrockCacheOptions { + + private BedrockCacheStrategy strategy = BedrockCacheStrategy.NONE; + + /** + * Creates a new builder for constructing BedrockCacheOptions. + * @return a new Builder instance + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Gets the caching strategy. + * @return the configured BedrockCacheStrategy + */ + public BedrockCacheStrategy getStrategy() { + return this.strategy; + } + + /** + * Sets the caching strategy. + * @param strategy the BedrockCacheStrategy to use + */ + public void setStrategy(BedrockCacheStrategy strategy) { + this.strategy = strategy; + } + + @Override + public String toString() { + return "BedrockCacheOptions{" + "strategy=" + this.strategy + '}'; + } + + /** + * Builder for constructing BedrockCacheOptions instances. + */ + public static class Builder { + + private final BedrockCacheOptions options = new BedrockCacheOptions(); + + /** + * Sets the caching strategy. + * @param strategy the BedrockCacheStrategy to use + * @return this Builder instance + */ + public Builder strategy(BedrockCacheStrategy strategy) { + this.options.setStrategy(strategy); + return this; + } + + /** + * Builds the BedrockCacheOptions instance. + * @return the configured BedrockCacheOptions + */ + public BedrockCacheOptions build() { + return this.options; + } + + } + +} diff --git a/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/api/BedrockCacheStrategy.java b/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/api/BedrockCacheStrategy.java new file mode 100644 index 00000000000..ba76d9c933c --- /dev/null +++ b/models/spring-ai-bedrock-converse/src/main/java/org/springframework/ai/bedrock/converse/api/BedrockCacheStrategy.java @@ -0,0 +1,187 @@ +/* + * Copyright 2023-2025 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.springframework.ai.bedrock.converse.api; + +/** + * Defines the caching strategy for AWS Bedrock prompt caching. Bedrock allows up to 4 + * cache breakpoints per request, and the cache hierarchy follows the order: tools → + * system → messages. + * + *

+ * <p>
+ * Prompt caching reduces latency and costs by reusing previously processed prompt
+ * content. Cached content has a 5-minute Time To Live (TTL) that resets with each cache
+ * hit.
+ *
+ * @author Soby Chacko
+ * @since 1.1.0
+ * @see <a href="https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html">AWS Bedrock
+ * Prompt Caching</a>
+ */
+public enum BedrockCacheStrategy {
+
+	/**
+	 * No caching (default behavior). All content is processed fresh on each request.
+	 * <p>
+	 * Use this when:
+	 * <ul>
+	 * <li>Requests are one-off or highly variable</li>
+	 * <li>Content doesn't meet minimum token requirements (1024+ tokens for most
+	 * models)</li>
+	 * <li>You want to avoid caching overhead</li>
+	 * </ul>
+	 */
+	NONE,
+
+	/**
+	 * Cache system instructions only. Places a cache breakpoint on the system message
+	 * content. Tools are cached implicitly via Bedrock's automatic ~20-block lookback
+	 * mechanism (content before the cache breakpoint is included in the cache).
+	 * <p>
+	 * Use this when:
+	 * <ul>
+	 * <li>System prompts are large and stable (1024+ tokens)</li>
+	 * <li>Tool definitions are relatively small (&lt;20 tools)</li>
+	 * <li>You want simple, single-breakpoint caching</li>
+	 * </ul>
+	 * <p>
+	 * Note: Changing tools will invalidate the cache since tools are
+	 * part of the cache prefix (they appear before system in the request hierarchy).
+	 * <p>
+	 * This is the recommended starting point for most use cases as it provides the best
+	 * balance of simplicity and effectiveness.
+	 */
+	SYSTEM_ONLY,
+
+	/**
+	 * Cache tool definitions only. Places a cache breakpoint after the last tool
+	 * definition. System messages and conversation history are not cached.
+	 * <p>
+	 * Use this when:
+	 * <ul>
+	 * <li>You have many tool definitions (20+ tools, 1024+ tokens total)</li>
+	 * <li>Tools are stable but system prompts change frequently</li>
+	 * <li>You want to cache tool schemas without caching system instructions</li>
+	 * </ul>
+	 * <p>
+	 * Important Model Compatibility:
+	 * <ul>
+	 * <li>Supported: Claude 3.x and Claude 4.x models (all variants)</li>
+	 * <li>Not Supported: Amazon Nova models (Nova Micro, Lite, Pro,
+	 * Premier) - these models only support caching for system and messages, not
+	 * tools</li>
+	 * </ul>
+	 * <p>
+	 * If you use this strategy with an unsupported model, AWS will return a
+	 * ValidationException. Use {@link #SYSTEM_ONLY} instead for Amazon Nova models.
+	 * <p>
+	 * Note: If no tools are present in the request, this strategy is
+	 * equivalent to NONE (no caching occurs).
+	 */
+	TOOLS_ONLY,
+
+	/**
+	 * Cache both tool definitions and system instructions. Places two cache breakpoints:
+	 * one after the last tool definition, and one after the last system message.
+	 * <p>
+	 * Use this when:
+	 * <ul>
+	 * <li>Both tools and system prompts are large and stable (1024+ tokens each)</li>
+	 * <li>You want maximum cache coverage</li>
+	 * <li>You're willing to use 2 of your 4 available cache breakpoints</li>
+	 * </ul>
+	 * <p>
+	 * Important Model Compatibility:
+	 * <ul>
+	 * <li>Supported: Claude 3.x and Claude 4.x models (all variants)</li>
+	 * <li>Not Supported: Amazon Nova models (Nova Micro, Lite, Pro,
+	 * Premier) - these models only support caching for system and messages, not
+	 * tools</li>
+	 * </ul>
+	 * <p>
+	 * If you use this strategy with an unsupported model, AWS will return a
+	 * ValidationException. Use {@link #SYSTEM_ONLY} instead for Amazon Nova models.
+	 * <p>
+	 * Cache Invalidation:
+	 * <ul>
+	 * <li>Changing tools invalidates both cache breakpoints (tools are the prefix)</li>
+	 * <li>Changing system prompts only invalidates the system cache (tools remain
+	 * cached)</li>
+	 * </ul>
+	 * <p>
+	 * This provides the most comprehensive caching but uses more cache breakpoints.
+	 */
+	SYSTEM_AND_TOOLS,
+
+	/**
+	 * Cache the entire conversation history up to and including the current user
+	 * question. This is ideal for multi-turn conversations where you want to reuse the
+	 * conversation context while asking new questions.
+	 * <p>
+	 * A cache breakpoint is placed on the last user message in the conversation. This
+	 * enables incremental caching where each conversation turn builds on the previous
+	 * cached prefix, providing significant cost savings and performance improvements.
+	 * <p>
+	 * Use this when:
+	 * <ul>
+	 * <li>Building multi-turn conversational applications (chatbots, assistants)</li>
+	 * <li>Conversation history is substantial (1024+ tokens)</li>
+	 * <li>Users are asking follow-up questions that require context from earlier
+	 * messages</li>
+	 * <li>You want to reduce latency and costs for ongoing conversations</li>
+	 * </ul>
+	 * <p>
+	 * Model Compatibility:
+	 * <ul>
+	 * <li>Verified: Claude 3.x and Claude 4.x models (all variants)</li>
+	 * <li>Note: Amazon Nova models theoretically support conversation
+	 * caching, but have not been verified in integration tests</li>
+	 * </ul>
+	 * <p>
+	 * How it works:
+	 * <ol>
+	 * <li>Identifies the last user message in the conversation</li>
+	 * <li>Places cache breakpoint as the last content block on that message</li>
+	 * <li>All messages up to and including the last user message are cached (system,
+	 * previous user/assistant turns, and current user question)</li>
+	 * <li>On the next turn, the cached context is reused and a new cache is created
+	 * including the assistant response and new user question</li>
+	 * </ol>
+	 * <p>
+	 * Example conversation flow:
+	 *
+	 * <pre>
+	 * Turn 1: "My name is Alice" → Response cached
+	 * Turn 2: "I work as a data scientist" → Response cached
+	 * Turn 3: "What career advice would you give me?" ← Cache applies here
+	 *         (Turns 1-2 are read from cache, Turn 3 question is fresh)
+	 * </pre>
+	 *
+	 * Cache behavior:
+	 * <ul>
+	 * <li>First request: Creates cache (cacheWriteInputTokens &gt; 0)</li>
+	 * <li>Subsequent requests: Reads from cache (cacheReadInputTokens &gt; 0)</li>
+	 * <li>Cache TTL: 5 minutes (resets on each cache hit)</li>
+	 * <li>Minimum content: 1024+ tokens required for caching to activate</li>
+	 * </ul>
+	 *
+ */ + CONVERSATION_HISTORY + +} diff --git a/models/spring-ai-bedrock-converse/src/test/java/org/springframework/ai/bedrock/converse/BedrockProxyChatModelIT.java b/models/spring-ai-bedrock-converse/src/test/java/org/springframework/ai/bedrock/converse/BedrockProxyChatModelIT.java index 2b2361cba03..1ede468204c 100644 --- a/models/spring-ai-bedrock-converse/src/test/java/org/springframework/ai/bedrock/converse/BedrockProxyChatModelIT.java +++ b/models/spring-ai-bedrock-converse/src/test/java/org/springframework/ai/bedrock/converse/BedrockProxyChatModelIT.java @@ -31,9 +31,12 @@ import org.slf4j.LoggerFactory; import reactor.core.publisher.Flux; +import org.springframework.ai.bedrock.converse.api.BedrockCacheOptions; +import org.springframework.ai.bedrock.converse.api.BedrockCacheStrategy; import org.springframework.ai.chat.client.ChatClient; import org.springframework.ai.chat.messages.AssistantMessage; import org.springframework.ai.chat.messages.Message; +import org.springframework.ai.chat.messages.SystemMessage; import org.springframework.ai.chat.messages.UserMessage; import org.springframework.ai.chat.model.ChatModel; import org.springframework.ai.chat.model.ChatResponse; @@ -366,6 +369,478 @@ void validateStreamCallResponseMetadata() { validateChatResponseMetadata(response, model); } + @Test + void testSystemOnlyPromptCaching() { + // NOTE: Prompt caching is supported by the following models (as of 2025): + // - Claude 3 Opus 4.1, Claude Opus 4, Claude Sonnet 4.5, Claude Sonnet 4, Claude + // 3.7 Sonnet + // - Claude 3.5 Haiku, Claude 3.5 Sonnet v2 + // - Amazon Nova Micro, Lite, Pro, Premier + // + // IMPORTANT: Newer Claude models require AWS Bedrock inference profiles instead + // of direct model IDs. + // If you get ValidationException about "on-demand throughput isn't supported", + // you need to: + // 1. Use an inference profile ARN/ID (e.g., + // "us.anthropic.claude-3-5-haiku-20241022-v1:0") + // 2. Ensure your AWS account/region has cross-region inference profiles enabled + // 3. Or use Amazon Nova models which work with direct model IDs + // + // Amazon Nova models work without inference profiles and are used in this test + // for reliability. + String model = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"; + + // Create a large system prompt (needs to exceed minimum token threshold for + // caching) + // Amazon Nova models require 1024+ tokens for caching to activate + // Each repetition adds ~160 tokens, so 7 repetitions = ~1120 tokens + String basePrompt = """ + You are an expert software architect with deep knowledge of distributed systems, + microservices, cloud computing, and software design patterns. Your role is to provide + detailed technical guidance on system architecture, design decisions, and best practices. 
+ + Key areas of expertise: + - Distributed systems design and architecture + - Microservices patterns and anti-patterns + - Cloud-native application development + - Event-driven architectures + - Database design and scaling strategies + - API design and RESTful services + - Security best practices + - Performance optimization and scalability + + """; + + // Repeat to exceed 1024 token minimum (approximate: 1 token ≈ 4 characters) + StringBuilder largeSystemPromptBuilder = new StringBuilder(); + for (int i = 0; i < 12; i++) { + largeSystemPromptBuilder.append(basePrompt); + } + largeSystemPromptBuilder.append("When answering questions, provide clear, structured responses with examples."); + + String largeSystemPrompt = largeSystemPromptBuilder.toString(); + + BedrockCacheOptions cacheOptions = BedrockCacheOptions.builder() + .strategy(BedrockCacheStrategy.SYSTEM_ONLY) + .build(); + + BedrockChatOptions chatOptions = BedrockChatOptions.builder() + .model(model) + .cacheOptions(cacheOptions) + .maxTokens(500) + .build(); + + // First request - should create cache + ChatResponse response1 = this.chatModel.call(new Prompt( + List.of(new SystemMessage(largeSystemPrompt), new UserMessage("What is a monolith?")), chatOptions)); + + // Verify first response is valid + assertThat(response1.getResults()).hasSize(1); + assertThat(response1.getResult().getOutput().getText()).isNotEmpty(); + + // Verify cache write tokens are present and positive (cache was created) + Integer cacheWrite1 = response1.getMetadata().get("cacheWriteInputTokens"); + logger.info("First request - cacheWriteInputTokens: {}", cacheWrite1); + assertThat(cacheWrite1).as("First request should write tokens to cache").isNotNull().isPositive(); + + // Verify no cache read on first request + Integer cacheRead1 = response1.getMetadata().get("cacheReadInputTokens"); + assertThat(cacheRead1).as("First request should not read from cache").isIn(null, 0); + + // Second request with same system prompt - should hit cache + ChatResponse response2 = this.chatModel + .call(new Prompt(List.of(new SystemMessage(largeSystemPrompt), new UserMessage("What is a microservice?")), + chatOptions)); + + // Verify second response is valid + assertThat(response2.getResults()).hasSize(1); + assertThat(response2.getResult().getOutput().getText()).isNotEmpty(); + + // Verify cache read tokens are present and positive (cache was used) + Integer cacheRead2 = response2.getMetadata().get("cacheReadInputTokens"); + logger.info("Second request - cacheReadInputTokens: {}", cacheRead2); + assertThat(cacheRead2).as("Second request should read tokens from cache").isNotNull().isPositive(); + + // Verify cache read matches what was written + assertThat(cacheRead2).as("Cache read tokens should match cache write tokens").isEqualTo(cacheWrite1); + + // Verify no cache write on second request (reusing existing cache) + Integer cacheWrite2 = response2.getMetadata().get("cacheWriteInputTokens"); + assertThat(cacheWrite2).as("Second request should not write new tokens to cache").isIn(null, 0); + } + + @Test + void testToolsOnlyPromptCaching() { + // NOTE: Testing tools-only caching requires multiple large tool definitions to + // exceed 1K tokens + // IMPORTANT: This test requires a Claude model (e.g., Claude 3.5 Haiku, Claude + // 3.7 Sonnet) + // Amazon Nova models do NOT support tool caching and will return + // ValidationException + String model = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"; + + // Create multiple tool callbacks to exceed the 1K token minimum for caching 
+ // Each tool definition adds ~200-300 tokens, so we need 4-5 tools + List toolCallbacks = createLargeToolCallbacks(); + + BedrockCacheOptions cacheOptions = BedrockCacheOptions.builder() + .strategy(BedrockCacheStrategy.TOOLS_ONLY) + .build(); + + BedrockChatOptions chatOptions = BedrockChatOptions.builder() + .model(model) + .cacheOptions(cacheOptions) + .toolCallbacks(List.copyOf(toolCallbacks)) + .maxTokens(500) + .build(); + + // First request - should create cache for tools + ChatResponse response1 = this.chatModel.call(new Prompt("What's the weather in Paris?", chatOptions)); + + // Verify first response is valid + assertThat(response1.getResults()).hasSize(1); + assertThat(response1.getResult().getOutput().getText()).isNotEmpty(); + + // Extract cache metrics from first request + Integer cacheWrite1 = response1.getMetadata().get("cacheWriteInputTokens"); + Integer cacheRead1 = response1.getMetadata().get("cacheReadInputTokens"); + logger.info("First request - cacheWriteInputTokens: {}, cacheReadInputTokens: {}", cacheWrite1, cacheRead1); + + // The first request may either: + // 1. Create a new cache (cacheWrite > 0, cacheRead = 0) if no prior cache exists + // 2. Use existing cache (cacheRead > 0) if previous test ran within 5min TTL + // At least one should be positive to confirm caching is working + int firstRequestCache = (cacheWrite1 != null ? cacheWrite1 : 0) + (cacheRead1 != null ? cacheRead1 : 0); + assertThat(firstRequestCache).as("First request should either write or read from cache").isPositive(); + + // Second request with same tools - should hit cache + ChatResponse response2 = this.chatModel.call(new Prompt("What's the weather in Tokyo?", chatOptions)); + + // Verify second response is valid + assertThat(response2.getResults()).hasSize(1); + assertThat(response2.getResult().getOutput().getText()).isNotEmpty(); + + // Verify cache read tokens are present (tools were read from cache) + Integer cacheRead2 = response2.getMetadata().get("cacheReadInputTokens"); + logger.info("Second request - cacheReadInputTokens: {}", cacheRead2); + assertThat(cacheRead2).as("Second request should read tool definitions from cache").isNotNull().isPositive(); + + // Verify the second request uses the same cache as was established in first + // request + int expectedTotalCache = (cacheWrite1 != null ? cacheWrite1 : 0) + (cacheRead1 != null ? cacheRead1 : 0); + assertThat(cacheRead2).as("Second request should read the same total cache").isEqualTo(expectedTotalCache); + } + + @Test + void testSystemAndToolsPromptCaching() { + // NOTE: Testing combined caching requires both large system prompt and multiple + // tools + // IMPORTANT: This test requires a Claude model (e.g., Claude 3.5 Haiku, Claude + // 3.7 Sonnet) + // Amazon Nova models do NOT support tool caching and will return + // ValidationException + String model = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"; + + // Create large system prompt (1K+ tokens) + String basePrompt = """ + You are an expert weather analyst with deep knowledge of meteorology, + climate patterns, and weather forecasting. Your role is to provide detailed + weather analysis and recommendations. 
+ + Key areas of expertise: + - Weather pattern analysis and forecasting + - Climate change impacts on weather + - Severe weather prediction and safety + - Seasonal weather trends + - Microclimate analysis + - Weather data interpretation + - Agricultural weather impacts + - Travel and event weather planning + + """; + + StringBuilder largeSystemPromptBuilder = new StringBuilder(); + for (int i = 0; i < 12; i++) { + largeSystemPromptBuilder.append(basePrompt); + } + largeSystemPromptBuilder.append("Provide detailed weather analysis with context and recommendations."); + + String largeSystemPrompt = largeSystemPromptBuilder.toString(); + + // Create multiple tool callbacks + List toolCallbacks = createLargeToolCallbacks(); + + BedrockCacheOptions cacheOptions = BedrockCacheOptions.builder() + .strategy(BedrockCacheStrategy.SYSTEM_AND_TOOLS) + .build(); + + BedrockChatOptions chatOptions = BedrockChatOptions.builder() + .model(model) + .cacheOptions(cacheOptions) + .toolCallbacks(List.copyOf(toolCallbacks)) + .maxTokens(500) + .build(); + + // First request - should create cache for both tools and system + ChatResponse response1 = this.chatModel.call(new Prompt( + List.of(new SystemMessage(largeSystemPrompt), new UserMessage("What's the weather in Paris?")), + chatOptions)); + + // Verify first response is valid + assertThat(response1.getResults()).hasSize(1); + assertThat(response1.getResult().getOutput().getText()).isNotEmpty(); + + // Extract cache metrics from first request + Integer cacheWrite1 = response1.getMetadata().get("cacheWriteInputTokens"); + Integer cacheRead1 = response1.getMetadata().get("cacheReadInputTokens"); + logger.info("First request - cacheWriteInputTokens: {}, cacheReadInputTokens: {}", cacheWrite1, cacheRead1); + + // The first request may either: + // 1. Create a new cache (cacheWrite > 0, cacheRead = 0) if no prior cache exists + // 2. Use existing cache (cacheRead > 0) if previous test ran within 5min TTL + // At least one should be positive to confirm caching is working + int firstRequestCache = (cacheWrite1 != null ? cacheWrite1 : 0) + (cacheRead1 != null ? cacheRead1 : 0); + assertThat(firstRequestCache).as("First request should either write or read from cache").isPositive(); + + // Second request with same tools and system - should hit both caches + ChatResponse response2 = this.chatModel.call(new Prompt( + List.of(new SystemMessage(largeSystemPrompt), new UserMessage("What's the weather in Tokyo?")), + chatOptions)); + + // Verify second response is valid + assertThat(response2.getResults()).hasSize(1); + assertThat(response2.getResult().getOutput().getText()).isNotEmpty(); + + // Verify cache read tokens are present (both caches were used) + Integer cacheRead2 = response2.getMetadata().get("cacheReadInputTokens"); + Integer cacheWrite2 = response2.getMetadata().get("cacheWriteInputTokens"); + logger.info("Second request - cacheReadInputTokens: {}, cacheWriteInputTokens: {}", cacheRead2, cacheWrite2); + assertThat(cacheRead2).as("Second request should read from both caches").isNotNull().isPositive(); + + // Verify the second request uses the same cache as was established in first + // request + // The total cache should be: what was written in first + what was read in first + int expectedTotalCache = (cacheWrite1 != null ? cacheWrite1 : 0) + (cacheRead1 != null ? 
cacheRead1 : 0); + assertThat(cacheRead2).as("Second request should read the same total cache").isEqualTo(expectedTotalCache); + + // The combined cache should be substantial (tools + system > 3000 tokens) + assertThat(cacheRead2).as("Combined cache should be substantial").isGreaterThan(3000); + } + + @Test + void testConversationHistoryPromptCachingWithClaude() { + // NOTE: Conversation history caching is verified to work with Claude models + // Amazon Nova models theoretically support this but haven't been verified in + // tests + String model = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"; + + // Create a large system prompt to contribute to total token count + // Need 1024+ tokens total for caching to activate + String systemPrompt = """ + You are a helpful AI assistant with expertise in career counseling and professional development. + You remember details from our conversation and use them to provide personalized responses. + Always acknowledge information shared by the user in previous messages when relevant to the current question. + Your advice should be specific, actionable, and tailored to the user's background, industry, and goals. + When providing career guidance, consider market trends, skill development, networking, and work-life balance. + """; + + // Repeat system prompt to ensure we have enough tokens (need 1024+ total) + // Claude tokenizes efficiently, so we need many repetitions + StringBuilder largeSystemPromptBuilder = new StringBuilder(); + for (int i = 0; i < 15; i++) { + largeSystemPromptBuilder.append(systemPrompt); + } + String largeSystemPrompt = largeSystemPromptBuilder.toString(); + + // Build conversation history with multiple turns to exceed token minimum + // Each turn adds context that should be cached + List conversationHistory = new ArrayList<>(); + conversationHistory.add(new SystemMessage(largeSystemPrompt)); + conversationHistory + .add(new UserMessage("My name is Alice and I work as a data scientist at TechCorp in San Francisco.")); + conversationHistory.add(new AssistantMessage("Nice to meet you, Alice! It's great to hear you work as a " + + "data scientist at TechCorp in San Francisco. Data science is such an exciting field. " + + "How long have you been working there?")); + conversationHistory.add(new UserMessage( + "I've been there for 3 years. I specialize in machine learning and natural language processing.")); + conversationHistory.add(new AssistantMessage("That's wonderful, Alice! Three years at TechCorp working on ML " + + "and NLP is impressive. Those are cutting-edge areas of data science. " + + "What kind of NLP projects do you typically work on?")); + conversationHistory.add(new UserMessage( + "Recently I've been building a recommendation system that analyzes user behavior and preferences.")); + conversationHistory + .add(new AssistantMessage("A recommendation system is a fantastic application of your ML and NLP skills! " + + "Analyzing user behavior and preferences can really enhance user experience. 
" + + "Are you using collaborative filtering, content-based methods, or hybrid approaches?")); + + // NOW add the current user question with CONVERSATION_HISTORY caching enabled + // This will cache all previous conversation turns + conversationHistory + .add(new UserMessage("Based on what I've told you about my work, what career advice would you give me?")); + + BedrockCacheOptions cacheOptions = BedrockCacheOptions.builder() + .strategy(BedrockCacheStrategy.CONVERSATION_HISTORY) + .build(); + + BedrockChatOptions chatOptions = BedrockChatOptions.builder() + .model(model) + .cacheOptions(cacheOptions) + .maxTokens(500) + .build(); + + // First request - should create cache for conversation history + ChatResponse response1 = this.chatModel.call(new Prompt(conversationHistory, chatOptions)); + + // Verify first response is valid + assertThat(response1.getResults()).hasSize(1); + assertThat(response1.getResult().getOutput().getText()).isNotEmpty(); + + // Verify response references the context (Alice, data scientist, etc.) + String responseText1 = response1.getResult().getOutput().getText().toLowerCase(); + logger.info("First response: {}", responseText1); + + // Extract cache metrics from first request + Integer cacheWrite1 = response1.getMetadata().get("cacheWriteInputTokens"); + Integer cacheRead1 = response1.getMetadata().get("cacheReadInputTokens"); + logger.info("First request - cacheWriteInputTokens: {}, cacheReadInputTokens: {}", cacheWrite1, cacheRead1); + + // The first request may either: + // 1. Create a new cache (cacheWrite > 0, cacheRead = 0) if no prior cache + // exists + // 2. Use existing cache (cacheRead > 0) if previous test ran within 5min TTL + int firstRequestCache = (cacheWrite1 != null ? cacheWrite1 : 0) + (cacheRead1 != null ? cacheRead1 : 0); + assertThat(firstRequestCache).as("First request should either write or read from cache").isPositive(); + + // Second request: Continue the conversation with a follow-up question + // The conversation history should be read from cache + List extendedConversation = new ArrayList<>(conversationHistory); + extendedConversation.add(response1.getResult().getOutput()); // Add assistant's + // response + extendedConversation.add(new UserMessage("What skills should I focus on developing to advance in my career?")); + + ChatResponse response2 = this.chatModel.call(new Prompt(extendedConversation, chatOptions)); + + // Verify second response is valid + assertThat(response2.getResults()).hasSize(1); + assertThat(response2.getResult().getOutput().getText()).isNotEmpty(); + + // Verify cache read tokens are present + Integer cacheRead2 = response2.getMetadata().get("cacheReadInputTokens"); + logger.info("Second request - cacheReadInputTokens: {}", cacheRead2); + assertThat(cacheRead2).as("Second request should read conversation history from cache") + .isNotNull() + .isPositive(); + + // The cache should be substantial (conversation history > 500 tokens) + assertThat(cacheRead2).as("Conversation cache should be substantial").isGreaterThan(500); + } + + /** + * Helper method to create multiple tool callbacks to exceed 1K token minimum for + * caching. Creates 5 different weather-related tools with verbose descriptions to + * ensure sufficient token count for Claude models (which tokenize more efficiently + * than Nova models). 
+ */ + private List createLargeToolCallbacks() { + return List.of(FunctionToolCallback.builder("getCurrentWeather", new MockWeatherService()).description(""" + Get the current weather conditions for a specific location anywhere in the world. + This comprehensive weather service provides real-time meteorological data including: + - Current temperature in Celsius and Fahrenheit with feels-like temperature + - Humidity levels and dew point information + - Atmospheric pressure readings (both sea level and station pressure) + - Wind speed, direction, and gusts information + - Cloud coverage percentage and type (cumulus, stratus, cirrus, etc.) + - Visibility distance in kilometers and miles + - Current precipitation status (rain, snow, sleet, hail) + - UV index and solar radiation levels + - Air quality index (AQI) and pollutant concentrations + - Sunrise and sunset times for the location + The service uses data from multiple meteorological stations and satellites to ensure + accuracy and reliability. Data is updated every 15 minutes for most locations worldwide. + """).inputType(MockWeatherService.Request.class).build(), FunctionToolCallback + .builder("getWeatherForecast", new MockWeatherService()) + .description(""" + Get the weather forecast for the next 7 days for a specific location with detailed predictions. + This advanced forecasting service provides comprehensive weather predictions including: + - Daily high and low temperatures with hourly breakdowns + - Precipitation probability percentage for each day and hour + - Expected precipitation amounts (rain, snow) in millimeters and inches + - Wind forecasts including speed, direction, and gust predictions + - Cloud coverage predictions and sky conditions (sunny, partly cloudy, overcast) + - Humidity levels and heat index/wind chill calculations + - Severe weather warnings and advisories if applicable + - Sunrise and sunset times for each day + - Moon phase information for planning outdoor activities + - Detailed text descriptions of expected conditions for each day + The forecast uses advanced meteorological models combining numerical weather prediction, + machine learning algorithms, and historical climate data to provide highly accurate + predictions. Forecasts are updated four times daily with improving accuracy for near-term + predictions and reasonable accuracy extending to 7 days out. + """) + .inputType(MockWeatherService.Request.class) + .build(), FunctionToolCallback.builder("getHistoricalWeather", new MockWeatherService()).description(""" + Get historical weather data for a specific location and date range with comprehensive analysis. + This powerful historical weather service provides access to decades of weather records including: + - Temperature records: daily highs, lows, and averages for any date range + - Precipitation history: rainfall and snowfall amounts with accumulation totals + - Temperature trend analysis comparing to long-term averages and records + - Extreme weather events: heat waves, cold snaps, severe storms in the time period + - Climate comparisons showing how conditions compare to historical norms + - Monthly and seasonal summaries with statistical analysis + - Detailed day-by-day weather observations from official weather stations + - Notable weather events and their impacts during the requested time period + The historical data is sourced from official meteorological agencies and weather stations + with records extending back multiple decades. 
This tool is invaluable for understanding + climate trends, planning activities based on historical patterns, agricultural planning, + research purposes, and understanding how current weather compares to historical context. + Data quality indicators are provided to show the reliability of older records. + """).inputType(MockWeatherService.Request.class).build(), + FunctionToolCallback.builder("getWeatherAlerts", new MockWeatherService()) + .description( + """ + Get active weather alerts and warnings for a specific location with critical safety information. + This essential safety service provides real-time alerts from official meteorological services including: + - Severe thunderstorm warnings with timing and intensity information + - Tornado warnings and watches with affected areas and safety instructions + - Hurricane and tropical storm alerts with projected paths and wind speeds + - Flash flood warnings and flood watches with affected waterways + - Winter storm warnings including snow, ice, and blizzard conditions + - Heat advisories and excessive heat warnings with health recommendations + - Wind advisories and high wind warnings with expected peak gusts + - Dense fog advisories affecting visibility and travel + - Air quality alerts for unhealthy pollution levels + - Fire weather warnings for dangerous wildfire conditions + Each alert includes the official alert level (advisory, watch, warning), affected geographic + areas, start and end times, detailed descriptions of the hazard, recommended actions for + safety, and contact information for local emergency management. Alerts are issued by + official national weather services and are updated in real-time as conditions evolve. + This service is critical for public safety and emergency preparedness. + """) + .inputType(MockWeatherService.Request.class) + .build(), + FunctionToolCallback.builder("getClimateData", new MockWeatherService()).description(""" + Get long-term climate data and comprehensive statistics for a specific location. + This climate analysis service provides in-depth climatological information including: + - Long-term average temperatures: monthly and annual means over 30+ year periods + - Precipitation patterns: average rainfall and snowfall by month and season + - Seasonal trend analysis showing typical weather patterns throughout the year + - Climate classification according to Köppen-Geiger system + - Record high and low temperatures for each month with dates + - Average humidity levels, cloud coverage, and sunshine hours + - Wind patterns including prevailing wind directions and average speeds + - Growing season length and frost dates important for agriculture + - Climate change indicators showing temperature and precipitation trends + - Extreme weather frequency: how often severe events typically occur + - Comparison with global and regional climate averages + - Microclimate variations within the region based on elevation and geography + - Best and worst months for various outdoor activities based on climate + This comprehensive climate data is essential for long-term planning, understanding regional + climate characteristics, agricultural planning, construction projects, tourism planning, + and understanding local climate change impacts. Data is derived from decades of official + meteorological observations and is continuously updated as new climate normals are established. 
+ """).inputType(MockWeatherService.Request.class).build()); + } + record ActorsFilmsRecord(String actor, List movies) { } diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/bedrock-converse.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/bedrock-converse.adoc index 2be1b4ed86b..d281a4568f3 100644 --- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/bedrock-converse.adoc +++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/bedrock-converse.adoc @@ -133,6 +133,593 @@ String response = ChatClient.create(this.chatModel) .content(); ---- +== Prompt Caching + +AWS Bedrock's https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html[prompt caching feature] allows you to cache frequently used prompts to reduce costs and improve response times for repeated interactions. +When you cache a prompt, subsequent identical requests can reuse the cached content, significantly reducing the number of input tokens processed. + +[NOTE] +==== +*Supported Models* + +Prompt caching is supported on Claude 3.x, Claude 4.x, and Amazon Nova models available through AWS Bedrock. + +*Token Requirements* + +Different models have different minimum token thresholds for cache effectiveness: +- Claude Sonnet 4 and most models: 1024+ tokens +- Model-specific requirements may vary - consult AWS Bedrock documentation +==== + +=== Cache Strategies + +Spring AI provides strategic cache placement through the `BedrockCacheStrategy` enum: + +* `NONE`: Disables prompt caching completely (default) +* `SYSTEM_ONLY`: Caches only the system message content +* `TOOLS_ONLY`: Caches tool definitions only (Claude models only) +* `SYSTEM_AND_TOOLS`: Caches both system message and tool definitions (Claude models only) +* `CONVERSATION_HISTORY`: Caches entire conversation history in chat memory scenarios + +This strategic approach ensures optimal cache breakpoint placement while staying within AWS Bedrock's 4-breakpoint limit. + +[NOTE] +==== +*Amazon Nova Limitations* + +Amazon Nova models (Nova Micro, Lite, Pro, Premier) only support caching for `system` and `messages` content. +They do **not** support caching for `tools`. + +If you attempt to use `TOOLS_ONLY` or `SYSTEM_AND_TOOLS` strategies with Nova models, AWS will return a `ValidationException`. +Use `SYSTEM_ONLY` strategy for Amazon Nova models. +==== + +=== Enabling Prompt Caching + +Enable prompt caching by setting `cacheOptions` on `BedrockChatOptions` and choosing a `strategy`. 
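+Because the model falls back to its default options when a prompt does not set `cacheOptions` (see the runtime-options merge in `BedrockProxyChatModel`), you can also configure caching once on the model and let every call inherit it.
+The following is a minimal sketch; it assumes a `BedrockProxyChatModel` builder that accepts default options, so adapt it to however you construct the model:
+
+[source,java]
+----
+// A minimal sketch: set cache options as model defaults (assumed builder API)
+BedrockProxyChatModel chatModel = BedrockProxyChatModel.builder()
+    .defaultOptions(BedrockChatOptions.builder()
+        .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0")
+        .cacheOptions(BedrockCacheOptions.builder()
+            .strategy(BedrockCacheStrategy.SYSTEM_ONLY)
+            .build())
+        .build())
+    .build();
+
+// Calls that do not set cacheOptions explicitly now use the SYSTEM_ONLY default.
+----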
+ +==== System-Only Caching + +The most common use case - cache system instructions across multiple requests: + +[source,java] +---- +// Cache system message content +ChatResponse response = chatModel.call( + new Prompt( + List.of( + new SystemMessage("You are a helpful AI assistant with extensive knowledge..."), + new UserMessage("What is machine learning?") + ), + BedrockChatOptions.builder() + .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0") + .cacheOptions(BedrockCacheOptions.builder() + .strategy(BedrockCacheStrategy.SYSTEM_ONLY) + .build()) + .maxTokens(500) + .build() + ) +); +---- + +==== Tools-Only Caching + +Cache large tool definitions while keeping system prompts dynamic (Claude models only): + +[source,java] +---- +// Cache tool definitions only +ChatResponse response = chatModel.call( + new Prompt( + "What's the weather in San Francisco?", + BedrockChatOptions.builder() + .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0") + .cacheOptions(BedrockCacheOptions.builder() + .strategy(BedrockCacheStrategy.TOOLS_ONLY) + .build()) + .toolCallbacks(weatherToolCallbacks) // Large tool definitions + .maxTokens(500) + .build() + ) +); +---- + +NOTE: This strategy is only supported on Claude models. +Amazon Nova models will return a `ValidationException`. + +==== System and Tools Caching + +Cache both system instructions and tool definitions for maximum reuse (Claude models only): + +[source,java] +---- +// Cache system message and tool definitions +ChatResponse response = chatModel.call( + new Prompt( + List.of( + new SystemMessage("You are a weather analysis assistant..."), + new UserMessage("What's the weather like in Tokyo?") + ), + BedrockChatOptions.builder() + .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0") + .cacheOptions(BedrockCacheOptions.builder() + .strategy(BedrockCacheStrategy.SYSTEM_AND_TOOLS) + .build()) + .toolCallbacks(weatherToolCallbacks) + .maxTokens(500) + .build() + ) +); +---- + +NOTE: This strategy uses 2 cache breakpoints (one for tools, one for system). +Only supported on Claude models. 
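+If the target model is not known until runtime, one defensive option is to downgrade the strategy for Amazon Nova models rather than risk a `ValidationException`.
+The sketch below is illustrative, not a Spring AI API: the helper method and the model-id check are assumptions based on Bedrock's model-id naming (for example, `amazon.nova-pro-v1:0`):
+
+[source,java]
+----
+// Hypothetical helper: pick a tool-caching strategy only for models that support it
+static BedrockCacheStrategy toolAwareStrategy(String modelId) {
+    // Nova models support system/message caching only, so fall back to SYSTEM_ONLY
+    boolean isNova = modelId != null && modelId.contains("nova");
+    return isNova ? BedrockCacheStrategy.SYSTEM_ONLY : BedrockCacheStrategy.SYSTEM_AND_TOOLS;
+}
+
+BedrockChatOptions options = BedrockChatOptions.builder()
+    .model(modelId)
+    .cacheOptions(BedrockCacheOptions.builder()
+        .strategy(toolAwareStrategy(modelId))
+        .build())
+    .build();
+----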
+
+==== Conversation History Caching
+
+Cache the growing conversation history for multi-turn chatbots and assistants:
+
+[source,java]
+----
+// Cache conversation history with ChatClient and memory
+ChatClient chatClient = ChatClient.builder(chatModel)
+    .defaultSystem("You are a personalized career counselor...")
+    .defaultAdvisors(MessageChatMemoryAdvisor.builder(chatMemory)
+        .conversationId(conversationId)
+        .build())
+    .build();
+
+String response = chatClient.prompt()
+    .user("What career advice would you give me?")
+    .options(BedrockChatOptions.builder()
+        .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0")
+        .cacheOptions(BedrockCacheOptions.builder()
+            .strategy(BedrockCacheStrategy.CONVERSATION_HISTORY)
+            .build())
+        .maxTokens(500)
+        .build())
+    .call()
+    .content();
+----
+
+==== Using the ChatClient Fluent API
+
+[source,java]
+----
+String response = ChatClient.create(chatModel)
+    .prompt()
+    .system("You are an expert document analyst...")
+    .user("Analyze this large document: " + document)
+    .options(BedrockChatOptions.builder()
+        .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0")
+        .cacheOptions(BedrockCacheOptions.builder()
+            .strategy(BedrockCacheStrategy.SYSTEM_ONLY)
+            .build())
+        .build())
+    .call()
+    .content();
+----
+
+=== Usage Example
+
+Here's a complete example demonstrating prompt caching with cost tracking:
+
+[source,java]
+----
+// Create system content that will be reused multiple times
+String largeSystemPrompt = "You are an expert software architect specializing in distributed systems...";
+// (Ensure this is 1024+ tokens for cache effectiveness)
+
+// First request - creates cache
+ChatResponse firstResponse = chatModel.call(
+    new Prompt(
+        List.of(
+            new SystemMessage(largeSystemPrompt),
+            new UserMessage("What is microservices architecture?")
+        ),
+        BedrockChatOptions.builder()
+            .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0")
+            .cacheOptions(BedrockCacheOptions.builder()
+                .strategy(BedrockCacheStrategy.SYSTEM_ONLY)
+                .build())
+            .maxTokens(500)
+            .build()
+    )
+);
+
+// Access cache-related token usage from metadata
+Integer cacheWrite1 = (Integer) firstResponse.getMetadata()
+    .getMetadata()
+    .get("cacheWriteInputTokens");
+Integer cacheRead1 = (Integer) firstResponse.getMetadata()
+    .getMetadata()
+    .get("cacheReadInputTokens");
+
+System.out.println("Cache creation tokens: " + cacheWrite1);
+System.out.println("Cache read tokens: " + cacheRead1);
+
+// Second request with same system prompt - reads from cache
+ChatResponse secondResponse = chatModel.call(
+    new Prompt(
+        List.of(
+            new SystemMessage(largeSystemPrompt), // Same prompt - cache hit
+            new UserMessage("What are the benefits of event sourcing?")
+        ),
+        BedrockChatOptions.builder()
+            .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0")
+            .cacheOptions(BedrockCacheOptions.builder()
+                .strategy(BedrockCacheStrategy.SYSTEM_ONLY)
+                .build())
+            .maxTokens(500)
+            .build()
+    )
+);
+
+Integer cacheWrite2 = (Integer) secondResponse.getMetadata()
+    .getMetadata()
+    .get("cacheWriteInputTokens");
+Integer cacheRead2 = (Integer) secondResponse.getMetadata()
+    .getMetadata()
+    .get("cacheReadInputTokens");
+
+System.out.println("Cache creation tokens: " + cacheWrite2); // Should be 0
+System.out.println("Cache read tokens: " + cacheRead2); // Should be > 0
+----
+
+=== Token Usage Tracking
+
+AWS Bedrock provides cache-specific metrics through the response metadata.
+Cache metrics are accessible via the metadata Map:
+
+[source,java]
+----
+ChatResponse response = chatModel.call(/* ... */);
+
+// Access cache metrics from metadata Map
+Integer cacheWrite = (Integer) response.getMetadata()
+    .getMetadata()
+    .get("cacheWriteInputTokens");
+Integer cacheRead = (Integer) response.getMetadata()
+    .getMetadata()
+    .get("cacheReadInputTokens");
+----
+
+Cache-specific metrics include:
+
+* `cacheWriteInputTokens`: the number of tokens used when creating a cache entry
+* `cacheReadInputTokens`: the number of tokens read from an existing cache entry
+
+When you first send a cached prompt:
+
+- `cacheWriteInputTokens` will be greater than 0
+- `cacheReadInputTokens` will be 0
+
+When you send the same cached prompt again (within the 5-minute TTL):
+
+- `cacheWriteInputTokens` will be 0
+- `cacheReadInputTokens` will be greater than 0
+
+=== Real-World Use Cases
+
+==== Legal Document Analysis
+
+Analyze large legal contracts or compliance documents efficiently by caching the document content across multiple questions:
+
+[source,java]
+----
+// Load a legal contract (PDF or text)
+String legalContract = loadDocument("merger-agreement.pdf"); // ~3000 tokens
+
+// System prompt with legal expertise
+String legalSystemPrompt = "You are an expert legal analyst specializing in corporate law. " +
+    "Analyze the following contract and provide precise answers about terms, obligations, and risks: " +
+    legalContract;
+
+// First analysis - creates cache
+ChatResponse riskAnalysis = chatModel.call(
+    new Prompt(
+        List.of(
+            new SystemMessage(legalSystemPrompt),
+            new UserMessage("What are the key termination clauses and associated penalties?")
+        ),
+        BedrockChatOptions.builder()
+            .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0")
+            .cacheOptions(BedrockCacheOptions.builder()
+                .strategy(BedrockCacheStrategy.SYSTEM_ONLY)
+                .build())
+            .maxTokens(1000)
+            .build()
+    )
+);
+
+// Subsequent questions reuse the cached document (~90% cheaper input tokens)
+ChatResponse obligationAnalysis = chatModel.call(
+    new Prompt(
+        List.of(
+            new SystemMessage(legalSystemPrompt), // Same content - cache hit
+            new UserMessage("List all financial obligations and payment schedules.")
+        ),
+        BedrockChatOptions.builder()
+            .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0")
+            .cacheOptions(BedrockCacheOptions.builder()
+                .strategy(BedrockCacheStrategy.SYSTEM_ONLY)
+                .build())
+            .maxTokens(1000)
+            .build()
+    )
+);
+----
+
+==== Batch Code Review
+
+Process multiple code files with consistent review criteria while caching the review guidelines:
+
+[source,java]
+----
+// Define comprehensive code review guidelines
+String reviewGuidelines = """
+    You are a senior software engineer conducting code reviews. Apply these criteria:
+    - Security vulnerabilities and best practices
+    - Performance optimizations and memory usage
+    - Code maintainability and readability
+    - Testing coverage and edge cases
+    - Design patterns and architecture compliance
+    """;
+
+List<String> codeFiles = Arrays.asList(
+    "UserService.java", "PaymentController.java", "SecurityConfig.java"
+);
+
+List<String> reviews = new ArrayList<>();
+
+for (String filename : codeFiles) {
+    String sourceCode = loadSourceFile(filename);
+
+    ChatResponse review = chatModel.call(
+        new Prompt(
+            List.of(
+                new SystemMessage(reviewGuidelines), // Cached across all reviews
+                new UserMessage("Review this " + filename + " code:\n\n" + sourceCode)
+            ),
+            BedrockChatOptions.builder()
+                .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0")
+                .cacheOptions(BedrockCacheOptions.builder()
+                    .strategy(BedrockCacheStrategy.SYSTEM_ONLY)
+                    .build())
+                .maxTokens(800)
+                .build()
+        )
+    );
+
+    reviews.add(review.getResult().getOutput().getText());
+}
+
+// Guidelines are cached after the first request; subsequent reviews are faster and cheaper
+----
+
+==== Customer Support with Knowledge Base
+
+Create a customer support system that caches your product knowledge base for consistent, accurate responses:
+
+[source,java]
+----
+// Load comprehensive product knowledge
+String knowledgeBase = """
+    PRODUCT DOCUMENTATION:
+    - API endpoints and authentication methods
+    - Common troubleshooting procedures
+    - Billing and subscription details
+    - Integration guides and examples
+    - Known issues and workarounds
+    """ + loadProductDocs(); // ~2500 tokens
+
+@Service
+public class CustomerSupportService {
+
+    public String handleCustomerQuery(String customerQuery, String customerId) {
+        ChatResponse response = chatModel.call(
+            new Prompt(
+                List.of(
+                    new SystemMessage("You are a helpful customer support agent. " +
" + + "Use this knowledge base to provide accurate solutions: " + knowledgeBase), + new UserMessage("Customer " + customerId + " asks: " + customerQuery) + ), + BedrockChatOptions.builder() + .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0") + .cacheOptions(BedrockCacheOptions.builder() + .strategy(BedrockCacheStrategy.SYSTEM_ONLY) + .build()) + .maxTokens(600) + .build() + ) + ); + + return response.getResult().getOutput().getText(); + } +} + +// Knowledge base is cached across all customer queries +// Multiple support agents can benefit from the same cached content +---- + +==== Multi-Tenant SaaS Application + +Cache shared tool definitions across different tenants while customizing system prompts per tenant: + +[source,java] +---- +// Shared tool definitions (cached once, used across all tenants) +List sharedTools = createLargeToolRegistry(); // ~2000 tokens + +// Tenant-specific configuration +@Service +public class MultiTenantAIService { + + public String processRequest(String tenantId, String userQuery) { + // Load tenant-specific system prompt (changes per tenant) + String tenantPrompt = loadTenantSystemPrompt(tenantId); + + ChatResponse response = chatModel.call( + new Prompt( + List.of( + new SystemMessage(tenantPrompt), // Tenant-specific, not cached + new UserMessage(userQuery) + ), + BedrockChatOptions.builder() + .model("us.anthropic.claude-3-7-sonnet-20250219-v1:0") + .cacheOptions(BedrockCacheOptions.builder() + .strategy(BedrockCacheStrategy.TOOLS_ONLY) + .build()) + .toolCallbacks(sharedTools) // Shared tools - cached + .maxTokens(500) + .build() + ) + ); + + return response.getResult().getOutput().getText(); + } +} + +// Tools cached once, each tenant gets customized system prompt +---- + +=== Best Practices + +1. **Choose the Right Strategy**: + - Use `SYSTEM_ONLY` for reusable system prompts and instructions (works with all models) + - Use `TOOLS_ONLY` when you have large stable tools but dynamic system prompts (Claude only) + - Use `SYSTEM_AND_TOOLS` when both system and tools are large and stable (Claude only) + - Use `CONVERSATION_HISTORY` with ChatClient memory for multi-turn conversations + - Use `NONE` to explicitly disable caching + +2. **Meet Token Requirements**: Focus on caching content that meets the minimum token requirements (1024+ tokens for most models). + +3. **Reuse Identical Content**: Caching works best with exact matches of prompt content. +Even small changes will require a new cache entry. + +4. **Monitor Token Usage**: Track cache effectiveness using the metadata metrics: + + Integer cacheWrite = (Integer) response.getMetadata().getMetadata().get("cacheWriteInputTokens"); + Integer cacheRead = (Integer) response.getMetadata().getMetadata().get("cacheReadInputTokens"); + if (cacheRead != null && cacheRead > 0) { + System.out.println("Cache hit: " + cacheRead + " tokens saved"); + } + +5. **Strategic Cache Placement**: The implementation automatically places cache breakpoints at optimal locations based on your chosen strategy, ensuring compliance with AWS Bedrock's 4-breakpoint limit. + +6. **Cache Lifetime**: AWS Bedrock caches have a fixed 5-minute TTL (Time To Live). +Each cache access resets the timer. + +7. **Model Compatibility**: Be aware of model-specific limitations: + - **Claude models**: Support all caching strategies + - **Amazon Nova models**: Only support `SYSTEM_ONLY` and `CONVERSATION_HISTORY` (tool caching not supported) + +8. 
+8. **Tool Stability**: When using the `TOOLS_ONLY`, `SYSTEM_AND_TOOLS`, or `CONVERSATION_HISTORY` strategy, ensure tools remain stable.
+Changing tool definitions will invalidate all downstream cache breakpoints due to cascade invalidation.
+
+=== Cache Invalidation and Cascade Behavior
+
+AWS Bedrock follows a hierarchical cache model with cascade invalidation:
+
+**Cache Hierarchy**: `Tools → System → Messages`
+
+A change at one level invalidates that level and all subsequent levels:
+
+[cols="1,1,1,1", stripes=even]
+|====
+| What Changes | Tools Cache | System Cache | Messages Cache
+
+| Tools | ❌ Invalid | ❌ Invalid | ❌ Invalid
+| System | ✅ Valid | ❌ Invalid | ❌ Invalid
+| Messages | ✅ Valid | ✅ Valid | ❌ Invalid
+|====
+
+**Example with the `SYSTEM_AND_TOOLS` strategy**:
+
+[source,java]
+----
+// Request 1: Cache both tools and system
+ChatResponse r1 = chatModel.call(
+    new Prompt(
+        List.of(new SystemMessage("System prompt"), new UserMessage("Question")),
+        BedrockChatOptions.builder()
+            .cacheOptions(BedrockCacheOptions.builder()
+                .strategy(BedrockCacheStrategy.SYSTEM_AND_TOOLS)
+                .build())
+            .toolCallbacks(tools)
+            .build()
+    )
+);
+// Result: Both caches created
+
+// Request 2: Change only the system prompt (tools unchanged)
+ChatResponse r2 = chatModel.call(
+    new Prompt(
+        List.of(new SystemMessage("DIFFERENT system prompt"), new UserMessage("Question")),
+        BedrockChatOptions.builder()
+            .cacheOptions(BedrockCacheOptions.builder()
+                .strategy(BedrockCacheStrategy.SYSTEM_AND_TOOLS)
+                .build())
+            .toolCallbacks(tools) // SAME tools
+            .build()
+    )
+);
+// Result: Tools cache HIT (reused), system cache MISS (recreated)
+
+// Request 3: Change the tools (system same as Request 2)
+ChatResponse r3 = chatModel.call(
+    new Prompt(
+        List.of(new SystemMessage("DIFFERENT system prompt"), new UserMessage("Question")),
+        BedrockChatOptions.builder()
+            .cacheOptions(BedrockCacheOptions.builder()
+                .strategy(BedrockCacheStrategy.SYSTEM_AND_TOOLS)
+                .build())
+            .toolCallbacks(newTools) // DIFFERENT tools
+            .build()
+    )
+);
+// Result: BOTH caches MISS (a tools change invalidates everything downstream)
+----
+
+=== Implementation Details
+
+The prompt caching implementation in Spring AI follows these key design principles:
+
+1. **Strategic Cache Placement**: Cache breakpoints are automatically placed at optimal locations based on the chosen strategy, ensuring compliance with AWS Bedrock's 4-breakpoint limit.
+
+2. **Provider Portability**: Cache configuration is done through `BedrockChatOptions` rather than individual messages, preserving compatibility when switching between different AI providers.
+
+3. **Thread Safety**: The cache breakpoint tracking is implemented with thread-safe mechanisms to handle concurrent requests correctly.
+
+4. **UNION Type Pattern**: The AWS SDK models cache points as UNION types, so a cache point is added as a separate content block rather than as a property of an existing block. This differs from APIs that attach cache markers to individual blocks, but it ensures type safety and API compliance.
+
+5. **Incremental Caching**: The `CONVERSATION_HISTORY` strategy places a cache breakpoint on the last user message, enabling incremental caching where each conversation turn builds on the previous cached prefix (see the sketch below).
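+
+To make the incremental pattern concrete, the following sketch reuses the memory-backed `chatClient` from the Conversation History example above and assumes the `CONVERSATION_HISTORY` cache options are applied as default options on the client. Each turn should report a cache read for the previously cached prefix plus a smaller cache write covering only the newly appended messages:
+
+[source,java]
+----
+for (String question : List.of(
+        "What career advice would you give me?",
+        "How should I prepare for a technical interview?",
+        "Which of those steps should I start with?")) {
+
+    ChatResponse turn = chatClient.prompt()
+        .user(question)
+        .call()
+        .chatResponse();
+
+    // The read count grows with the conversation; the write count covers
+    // only the messages appended since the last turn.
+    System.out.printf("write=%s read=%s%n",
+        turn.getMetadata().getMetadata().get("cacheWriteInputTokens"),
+        turn.getMetadata().getMetadata().get("cacheReadInputTokens"));
+}
+----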
+
+=== Cost Considerations
+
+AWS Bedrock pricing for prompt caching (approximate; varies by model):
+
+* **Cache writes**: ~25% more expensive than base input tokens
+* **Cache reads**: ~90% cheaper (only 10% of the base input token price)
+* **Break-even point**: after just one cache read, you've saved money
+
+**Example cost calculation**:
+
+[source,java]
+----
+// System prompt: 2000 tokens
+// User question: 50 tokens
+
+// Without caching (5 requests):
+// Cost: 5 × (2000 + 50) = 10,250 tokens at the base rate
+
+// With caching (5 requests):
+// Request 1: 2000 tokens × 1.25 (cache write) + 50 = 2,550 tokens
+// Requests 2-5: 4 × (2000 × 0.10 (cache read) + 50) = 4 × 250 = 1,000 tokens
+// Total: 2,550 + 1,000 = 3,550 token equivalents
+
+// Savings: (10,250 - 3,550) / 10,250 = 65% cost reduction
+----
+
 == Tool Calling
 
 The Bedrock Converse API supports tool calling capabilities, allowing models to use tools during conversations.