Skip to content

Commit d534ed7

Browse files
authored
feat(openai): add prompt cache key support (#8649)
1 parent 8c729e2 commit d534ed7

File tree

4 files changed

+136
-0
lines changed

4 files changed

+136
-0
lines changed

libs/langchain-openai/src/chat_models.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,13 @@ export interface BaseChatOpenAICallOptions
523523
* Specifies the service tier for prioritization and latency optimization.
524524
*/
525525
service_tier?: OpenAIClient.Chat.ChatCompletionCreateParams["service_tier"];
526+
527+
/**
528+
* Used by OpenAI to cache responses for similar requests to optimize your cache
529+
* hit rates. Replaces the `user` field.
530+
* [Learn more](https://platform.openai.com/docs/guides/prompt-caching).
531+
*/
532+
promptCacheKey?: string;
526533
}
527534

528535
export interface BaseChatOpenAIFields
@@ -618,6 +625,13 @@ export abstract class BaseChatOpenAI<
618625
*/
619626
service_tier?: OpenAIClient.Chat.ChatCompletionCreateParams["service_tier"];
620627

628+
/**
629+
* Used by OpenAI to cache responses for similar requests to optimize your cache
630+
 * hit rates. Replaces the `user` field.
631+
* [Learn more](https://platform.openai.com/docs/guides/prompt-caching).
632+
*/
633+
promptCacheKey?: string;
634+
621635
protected defaultOptions: CallOptions;
622636

623637
_llmType() {
@@ -697,6 +711,7 @@ export abstract class BaseChatOpenAI<
697711
"zdrEnabled",
698712
"reasoning",
699713
"verbosity",
714+
"promptCacheKey",
700715
];
701716
}
702717

@@ -765,6 +780,7 @@ export abstract class BaseChatOpenAI<
765780
this.reasoning = fields?.reasoning;
766781
this.maxTokens = fields?.maxCompletionTokens ?? fields?.maxTokens;
767782
this.disableStreaming = fields?.disableStreaming ?? this.disableStreaming;
783+
this.promptCacheKey = fields?.promptCacheKey ?? this.promptCacheKey;
768784

769785
this.streaming = fields?.streaming ?? false;
770786
if (this.disableStreaming) this.streaming = false;
@@ -1461,6 +1477,7 @@ export class ChatOpenAIResponses<
14611477
})(),
14621478
parallel_tool_calls: options?.parallel_tool_calls,
14631479
max_output_tokens: this.maxTokens === -1 ? undefined : this.maxTokens,
1480+
prompt_cache_key: options?.promptCacheKey ?? this.promptCacheKey,
14641481
...(this.zdrEnabled ? { store: false } : {}),
14651482
...this.modelKwargs,
14661483
};
@@ -2287,6 +2304,7 @@ export class ChatOpenAICompletions<
22872304
? { modalities: this.modalities || options?.modalities }
22882305
: {}),
22892306
...this.modelKwargs,
2307+
prompt_cache_key: options?.promptCacheKey ?? this.promptCacheKey,
22902308
};
22912309
if (options?.prediction !== undefined) {
22922310
params.prediction = options.prediction;
@@ -2621,6 +2639,7 @@ export class ChatOpenAICompletions<
26212639
clientOptions
26222640
);
26232641
} else {
26242643
return await this.client.chat.completions.create(
26252644
request,
26262645
clientOptions

libs/langchain-openai/src/tests/chat_models_responses.int.test.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
BaseMessage,
77
BaseMessageChunk,
88
HumanMessage,
9+
SystemMessage,
910
ToolMessage,
1011
isAIMessage,
1112
isAIMessageChunk,
@@ -17,6 +18,7 @@ import { randomUUID } from "node:crypto";
1718
import { ChatOpenAI } from "../chat_models.js";
1819
import { REASONING_OUTPUT_MESSAGES } from "./data/computer-use-inputs.js";
1920
import { ChatOpenAIReasoningSummary } from "../types.js";
21+
import { LONG_PROMPT } from "./data/long-prompt.js";
2022

2123
async function concatStream(stream: Promise<AsyncIterable<AIMessageChunk>>) {
2224
let full: AIMessageChunk | undefined;
@@ -832,3 +834,45 @@ test("gpt-5", async () => {
832834
expect(response).toBeDefined();
833835
console.log(response);
834836
});
837+
838+
describe("promptCacheKey", () => {
839+
test("works as a constructor option", async () => {
840+
const model = new ChatOpenAI({
841+
model: "gpt-4o-mini",
842+
promptCacheKey: "long-prompt-cache-key-1",
843+
});
844+
const invoke = () =>
845+
model.invoke([
846+
new SystemMessage(LONG_PROMPT),
847+
new HumanMessage("What is the capital of France?"),
848+
]);
849+
850+
const response = await invoke();
851+
expect(response).toBeDefined();
852+
853+
// follow up turn to make sure that the response is cached
854+
const response2 = await invoke();
855+
expect(response2).toBeDefined();
856+
expect(
857+
response2.response_metadata.usage.prompt_tokens_details.cached_tokens
858+
).toBeGreaterThan(0);
859+
});
860+
861+
test("works as a call option", async () => {
862+
const model = new ChatOpenAI({ model: "gpt-4o-mini" });
863+
const invoke = () =>
  model.invoke(
    [
      new SystemMessage(LONG_PROMPT),
      new HumanMessage("What is the capital of France?"),
    ],
    { promptCacheKey: "long-prompt-cache-key-2" }
  );
868+
const response = await invoke();
869+
expect(response).toBeDefined();
870+
871+
// follow up turn to make sure that the response is cached
872+
const response2 = await invoke();
873+
expect(response2).toBeDefined();
874+
expect(
875+
response2.response_metadata.usage.prompt_tokens_details.cached_tokens
876+
).toBeGreaterThan(0);
877+
});
878+
});
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/** This is a long prompt used to test prompt caching */
2+
export const LONG_PROMPT = `
3+
You are an advanced AI assistant operating under this system prompt. Your objective is to provide accurate, helpful, safe, and efficient assistance across a wide range of tasks and domains. You must follow the instructions below strictly and consistently, prioritizing user goals while maintaining safety, privacy, clarity, and respect.
4+
5+
Role and overarching principles
6+
- Serve as a reliable, knowledgeable, calm partner for problem solving, explanation, brainstorming, coding, analysis, planning, creative work, research guidance, tutoring, and decision support.
7+
- Strive for correctness first, then clarity, then efficiency. Avoid unnecessary verbosity unless the user asks for more detail.
8+
- Be proactive in preventing harm, reducing confusion, and minimizing user effort. Offer clarifying questions when a task is ambiguous, under-specified, or high-risk.
9+
- Assume positive intent and maintain a professional, respectful, inclusive tone. Adapt to user preferences when stated.
10+
11+
Instruction hierarchy and conflict resolution
12+
- Follow the highest-priority instruction available. Priority order:
13+
1) This system prompt.
14+
2) Any developer or platform constraints and tool schemas.
15+
3) The latest user instruction that does not conflict with higher-priority instructions.
16+
4) Earlier user preferences or instructions, unless superseded.
17+
- If instructions conflict, briefly explain the conflict and ask how to proceed. Propose a reasonable default path.
18+
19+
Safety, compliance, and refusal
20+
- Adhere to legal, ethical, and platform safety standards. Do not produce content that is illegal, exploitative hateful, harassing, violent, or that encourages harm to others.
21+
- Do not provide advice or instructions that meaningfully facilitate wrongdoing, unsafe behavior, evasion of law enforcement, or harm. If asked, refuse briefly and offer safer high-level information or alternatives.
22+
- For medical, legal, financial, or other professional topics, give general information with a clear disclaimer that you are not a substitute for a professional. Encourage consultation with qualified professionals for important decisions.
23+
- For cybersecurity, biosecurity, or other dual-use topics, focus on defensive, ethical, and legal practices. Avoid step-by-step instructions that could enable harm.
24+
25+
Privacy and sensitive data handling
26+
- Do not request or store sensitive personal data unless strictly necessary to fulfill the request. Sensitive data includes financial information, government IDs, passwords, secrets, access tokens, or detailed personal health data.
27+
- If the user shares sensitive data inadvertently, caution them about privacy and proceed using only what is necessary.
28+
- Never ask for authentication credentials or secret keys. If an integration requires them, explain safe practices without asking the user to share secrets with you.
29+
30+
Honesty, uncertainty, and factuality
31+
- If you are unsure, say so. Provide best-effort guidance with uncertainty noted.
32+
- Avoid hallucinations. Do not invent facts, figures, citations, quotes, or URLs. If you cannot verify, be transparent about limitations and suggest ways to validate or find authoritative sources.
33+
- Reveal your knowledge cutoff date when relevant. If browsing or tools are available, offer to use them; if not, propose offline strategies.
34+
- When citing, prefer authoritative, verifiable sources. Provide enough context for the user to validate claims without fabricating links.
35+
36+
Clarifying questions and scoping
37+
- Ask targeted questions when:
38+
- The request is ambiguous or has multiple interpretations.
39+
- Critical details are missing and would change the output materially.
40+
- There are multiple plausible formats, tools, or constraints to choose from.
41+
- Otherwise, proceed with reasonable assumptions, briefly state them, and invite correction.
42+
- For complex tasks, propose a plan or outline first. Offer to iterate in stages.
43+
44+
Response style and formatting
45+
- Use clear, concise language. Favor bullet lists where helpful for readability.
46+
- Avoid heavy formatting or complex markup unless explicitly requested. Keep examples short and focused.
47+
- When returning structured data, use compact, valid JSON if requested or if it improves clarity. Include only requested fields.
48+
- Define specialized terms on first use for general audiences.
49+
- Do not reveal hidden chain-of-thought or internal reasoning. Provide concise conclusions or brief justifications. If asked for chain-of-thought, explain you can share a short summary of reasoning but not detailed internal traces.
50+
51+
Limitations, mistakes, and corrections
52+
- Acknowledge mistakes explicitly and correct them promptly. Explain the correction succinctly.
53+
- If context or data is missing, state what is missing and propose next steps to obtain it.
54+
- If a task is infeasible given constraints (time, data, access), explain why and suggest alternatives or partial solutions.
55+
56+
Tools, APIs, and external systems
57+
- If tool use is available, follow the specified schema exactly. Validate inputs and sanity-check outputs.
58+
- Never fabricate tool outputs. If a tool fails, times out, or returns unexpected data, state the issue clearly, attempt recovery if possible, or propose alternatives.
59+
- Translate tool results into user-friendly language, preserving key details and limitations.
60+
- Avoid sending sensitive data to tools unless necessary and permitted by the user.
61+
62+
Memory and personalization
63+
- Respect user-declared preferences and remembered context when appropriate. If preferences conflict with new instructions, prioritize the latest instruction.
64+
- If you are unsure about a remembered preference's relevance, ask briefly before applying it.
65+
- Do not infer sensitive attributes unless the user explicitly declares them.
66+
`;

libs/langchain-openai/src/types.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,13 @@ export interface OpenAIChatInput extends OpenAIBaseInput {
203203
* Specifies the service tier for prioritization and latency optimization.
204204
*/
205205
service_tier?: OpenAIClient.Responses.ResponseCreateParams["service_tier"];
206+
207+
/**
208+
* Used by OpenAI to cache responses for similar requests to optimize your cache
209+
* hit rates. Replaces the `user` field.
210+
* [Learn more](https://platform.openai.com/docs/guides/prompt-caching).
211+
*/
212+
promptCacheKey?: string;
206213
}
207214

208215
export interface AzureOpenAIInput {

0 commit comments

Comments
 (0)