fix(core): preserve multimodal content in getBufferString as placeholders (#10424)

pawel-twardziak · web-flow · commit d69dfcca9750 · 2026-03-16T13:42:56.000-07:00
diff --git a/.changeset/warm-planes-glow.md b/.changeset/warm-planes-glow.md
@@ -0,0 +1,5 @@
+---
+"@langchain/core": patch
+---
+
+fix(core): preserve multimodal content in getBufferString as placeholders
diff --git a/libs/langchain-core/src/messages/tests/message_utils.test.ts b/libs/langchain-core/src/messages/tests/message_utils.test.ts
@@ -624,8 +624,8 @@ test("getBufferString can handle complex messages", () => {
   expect(bufferString2).toBe("AI: Hello there!");
 
   const bufferString3 = getBufferString(messageArr3);
-  // Image content should return empty string for text property
-  expect(bufferString3).toBe("Human: ");
+  // Image-only content should produce placeholders, not empty string
+  expect(bufferString3).toBe("Human: [image][image]");
 });
 
 test("getBufferString includes tool_calls for AI messages", () => {
@@ -700,6 +700,75 @@ test("getBufferString uses text property to avoid metadata inflation", () => {
   expect(bufferString).not.toContain("additional_kwargs");
 });
 
+test("getBufferString preserves non-text content block placeholders", () => {
+  // image and image_url blocks both render as [image], with no separator between them
+  const imgMsg = new HumanMessage({
+    content: [
+      { type: "image", source: { type: "base64", data: "abc" } },
+      {
+        type: "image_url",
+        image_url: { url: "https://example.com/img.png" },
+      },
+    ],
+  });
+  expect(getBufferString([imgMsg])).toBe("Human: [image][image]");
+
+  // audio and input_audio blocks both render as [audio]
+  const audioMsg = new HumanMessage({
+    content: [
+      { type: "audio", source: { type: "base64", data: "abc" } },
+      { type: "input_audio", data: "abc", format: "wav" },
+    ],
+  });
+  expect(getBufferString([audioMsg])).toBe("Human: [audio][audio]");
+
+  // video block renders as [video]
+  const videoMsg = new HumanMessage({
+    content: [{ type: "video", source: { type: "base64", data: "abc" } }],
+  });
+  expect(getBufferString([videoMsg])).toBe("Human: [video]");
+
+  // file block renders as [file]; its mimeType and data do not appear in the output
+  const fileMsg = new HumanMessage({
+    content: [
+      {
+        type: "file",
+        source: { type: "base64", data: "abc" },
+        mimeType: "application/pdf",
+      },
+    ],
+  });
+  expect(getBufferString([fileMsg])).toBe("Human: [file]");
+
+  // text-plain block with an inline text field renders as that text, not a placeholder
+  const textPlainMsg = new HumanMessage({
+    content: [{ type: "text-plain", text: "hello world" }],
+  });
+  expect(getBufferString([textPlainMsg])).toBe("Human: hello world");
+
+  // reasoning block contributes nothing, so only the text block's content appears
+  const reasoningMsg = new AIMessage({
+    content: [
+      { type: "reasoning", reasoning: "thinking..." },
+      { type: "text", text: "answer" },
+    ],
+  });
+  expect(getBufferString([reasoningMsg])).toBe("AI: answer");
+
+  // Mixed content: text and placeholders are concatenated in block order
+  const mixedMsg = new HumanMessage({
+    content: [
+      { type: "text", text: "Look at this: " },
+      { type: "image", source: { type: "base64", data: "abc" } },
+      { type: "text", text: " and listen to this: " },
+      { type: "audio", source: { type: "base64", data: "def" } },
+    ],
+  });
+  expect(getBufferString([mixedMsg])).toBe(
+    "Human: Look at this: [image] and listen to this: [audio]"
+  );
+});
+
 describe("chat message conversions", () => {
   it("can convert a chat message to a stored message and back", () => {
     const originalMessages = [
diff --git a/libs/langchain-core/src/messages/utils.ts b/libs/langchain-core/src/messages/utils.ts
@@ -305,6 +305,45 @@ export function coerceMessageLikeToMessage(
   }
 }
 
+/**
+ * Renders a single content block to a compact string representation.
+ * Strings and `text` blocks are returned as-is; media blocks (image, audio,
+ * video, file) become fixed placeholders like `[image]`, so their payload
+ * (e.g. base64 data) is never copied into the output.
+ */
+function _contentBlockToString(
+  block: string | { type?: string; [key: string]: unknown }
+): string {
+  if (typeof block === "string") return block; // plain string: nothing to render
+  switch (block.type) {
+    case "text":
+      return (block as { text: string }).text ?? ""; // absent text renders as ""
+    case "text-plain":
+      return (block as { text?: string }).text ?? "[text-plain file]"; // inline text if present, else a placeholder
+    case "image":
+    case "image_url":
+      return "[image]"; // both image block types share one placeholder
+    case "audio":
+    case "input_audio":
+      return "[audio]"; // both audio block types share one placeholder
+    case "video":
+      return "[video]";
+    case "file":
+      return "[file]";
+    case "reasoning":
+    case "tool_call":
+    case "tool_call_chunk":
+    case "invalid_tool_call":
+    case "server_tool_call":
+    case "server_tool_call_chunk":
+    case "server_tool_call_result":
+    case "non_standard":
+      return ""; // reasoning, tool-call and non-standard blocks render as the empty string
+    default:
+      return block.type ? `[${block.type}]` : ""; // unlisted type: [type]; no type: ""
+  }
+}
+
 /**
  * This function is used by memory classes to get a string representation
  * of the chat message history, based on the message content and role.
@@ -341,9 +380,13 @@ export function getBufferString(
     }
     const nameStr = m.name ? `${m.name}, ` : "";
 
-    // Use m.text property which extracts only text content, avoiding metadata
-    // For non-string content (e.g., content blocks), m.text extracts only text blocks
-    const readableContent = m.text;
+    // Render content compactly: text as-is, multimodal blocks as placeholders
+    const readableContent =
+      typeof m.content === "string"
+        ? m.content
+        : Array.isArray(m.content)
+          ? m.content.map(_contentBlockToString).filter(Boolean).join("")
+          : "";
 
     let message = `${role}: ${nameStr}${readableContent}`;
 

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"@langchain/core": patch
 +---
++
 +fix(core): preserve multimodal content in getBufferString as placeholders