Anthropic accepts messages with images (#858)

andreibondarev · github-advanced-security[bot] · web-flow · commit ffe0de320719 · 2024-10-28T21:06:10.000-04:00
* Anthropic accepts messages with images

* Code comments

* Fix code scanning alert no. 4: Use of `Kernel.open` or `IO.read` or similar sinks with a non-constant value

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>

* Update anthropic_message.rb

* Code comments and CHANGELOG

* Revert to URI.open()

* Fixes

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - [SECURITY]: A change which fixes a security vulnerability.
 
 ## [Unreleased]
+- [FEATURE] [https://github.com/patterns-ai-core/langchainrb/pull/858] Assistant, when using Anthropic, now also accepts `image_url` in the message.
 
 ## [0.19.0] - 2024-10-23
 - [BREAKING] [https://github.com/patterns-ai-core/langchainrb/pull/840] Rename `chat_completion_model_name` parameter to `chat_model` in Langchain::LLM parameters.
diff --git a/lib/langchain/assistant/llm/adapters/anthropic.rb b/lib/langchain/assistant/llm/adapters/anthropic.rb
@@ -38,9 +38,7 @@ def build_chat_params(
           # @param tool_call_id [String] The tool call ID
           # @return [Messages::AnthropicMessage] The Anthropic message
           def build_message(role:, content: nil, image_url: nil, tool_calls: [], tool_call_id: nil)
-            Langchain.logger.warn "WARNING: Image URL is not supported by Anthropic currently" if image_url
-
-            Messages::AnthropicMessage.new(role: role, content: content, tool_calls: tool_calls, tool_call_id: tool_call_id)
+            Messages::AnthropicMessage.new(role: role, content: content, image_url: image_url, tool_calls: tool_calls, tool_call_id: tool_call_id)
           end
 
           # Extract the tool call information from the Anthropic tool call hash
diff --git a/lib/langchain/assistant/messages/anthropic_message.rb b/lib/langchain/assistant/messages/anthropic_message.rb
@@ -18,13 +18,20 @@ class AnthropicMessage < Base
         # @param content [String] The content of the message
         # @param tool_calls [Array<Hash>] The tool calls made in the message
         # @param tool_call_id [String] The ID of the tool call
-        def initialize(role:, content: nil, tool_calls: [], tool_call_id: nil)
+        def initialize(
+          role:,
+          content: nil,
+          image_url: nil,
+          tool_calls: [],
+          tool_call_id: nil
+        )
           raise ArgumentError, "Role must be one of #{ROLES.join(", ")}" unless ROLES.include?(role)
           raise ArgumentError, "Tool calls must be an array of hashes" unless tool_calls.is_a?(Array) && tool_calls.all? { |tool_call| tool_call.is_a?(Hash) }
 
           @role = role
           # Some Tools return content as a JSON hence `.to_s`
           @content = content.to_s
+          @image_url = image_url
           @tool_calls = tool_calls
           @tool_call_id = tool_call_id
         end
@@ -33,25 +40,83 @@ def initialize(role:, content: nil, tool_calls: [], tool_call_id: nil)
         #
         # @return [Hash] The message as an Anthropic API-compatible hash
         def to_hash
-          {}.tap do |h|
-            h[:role] = tool? ? "user" : role
-
-            h[:content] = if tool?
-              [
-                {
-                  type: "tool_result",
-                  tool_use_id: tool_call_id,
-                  content: content
-                }
-              ]
-            elsif tool_calls.any?
-              tool_calls
-            else
-              content
-            end
+          if assistant?
+            assistant_hash
+          elsif tool?
+            tool_hash
+          elsif user?
+            user_hash
           end
         end
 
+        # Convert the assistant message to an Anthropic API-compatible hash
+        #
+        # @return [Hash] The message as an Anthropic API-compatible hash, with the role as "assistant"
+        def assistant_hash
+          {
+            role: "assistant",
+            content: if tool_calls.any?
+                       tool_calls
+                     else
+                       content
+                     end
+          }
+        end
+
+        # Convert the tool result message to an Anthropic API-compatible hash
+        #
+        # @return [Hash] The message as an Anthropic API-compatible hash, with the role as "user"
+        def tool_hash
+          {
+            role: "user",
+            # TODO: Tool can also return images
+            # https://docs.anthropic.com/en/docs/build-with-claude/tool-use#handling-tool-use-and-tool-result-content-blocks
+            content: [
+              {
+                type: "tool_result",
+                tool_use_id: tool_call_id,
+                content: content
+              }
+            ]
+          }
+        end
+
+        # Convert the user message to an Anthropic API-compatible hash
+        #
+        # @return [Hash] The message as an Anthropic API-compatible hash, with the role as "user"
+        def user_hash
+          {
+            role: "user",
+            content: build_content_array
+          }
+        end
+
+        # Builds the content value for the message hash
+        # @return [Array<Hash>] An array of content hashes
+        def build_content_array
+          content_details = []
+
+          if content && !content.empty?
+            content_details << {
+              type: "text",
+              text: content
+            }
+          end
+
+          if image
+            content_details << {
+              type: "image",
+              source: {
+                type: "base64",
+                data: image.base64,
+                media_type: image.mime_type
+              }
+            }
+          end
+
+          content_details
+        end
+
         # Check if the message is a tool call
         #
         # @return [Boolean] true/false whether this message is a tool call
diff --git a/lib/langchain/assistant/messages/base.rb b/lib/langchain/assistant/messages/base.rb
@@ -50,6 +50,10 @@ def standard_role
           # TODO: Should we return :unknown or raise an error?
           :unknown
         end
+
+        def image
+          image_url ? Utils::ImageWrapper.new(image_url) : nil
+        end
       end
     end
   end
diff --git a/lib/langchain/utils/image_wrapper.rb b/lib/langchain/utils/image_wrapper.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+require "open-uri"
+
+module Langchain
+  module Utils
+    class ImageWrapper
+      attr_reader :image_url
+
+      def initialize(image_url)
+        @image_url = image_url
+      end
+
+      def base64
+        @base64 ||= begin
+          image_data = open_image.read
+          Base64.strict_encode64(image_data)
+        end
+      end
+
+      def mime_type
+        # TODO: Make it work with local files
+        open_image.meta["content-type"]
+      end
+
+      private
+
+      def open_image
+        # TODO: Make it work with local files
+        uri = URI.parse(image_url)
+        raise URI::InvalidURIError, "Invalid URL scheme" unless %w[http https].include?(uri.scheme)
+        @open_image ||= URI.open(image_url) # rubocop:disable Security/Open
+      end
+    end
+  end
+end
diff --git a/spec/fixtures/loaders/sf-cable-car.jpeg b/spec/fixtures/loaders/sf-cable-car.jpeg
diff --git a/spec/langchain/assistant/assistant_spec.rb b/spec/langchain/assistant/assistant_spec.rb
@@ -1146,7 +1146,7 @@
         before do
           allow(subject.llm).to receive(:chat)
             .with(
-              messages: [{role: "user", content: "Please calculate 2+2"}],
+              messages: [{role: "user", content: [{text: "Please calculate 2+2", type: "text"}]}],
               tools: calculator.class.function_schemas.to_anthropic_format,
               tool_choice: {disable_parallel_tool_use: false, type: "auto"},
               system: instructions
@@ -1191,7 +1191,7 @@
           allow(subject.llm).to receive(:chat)
             .with(
               messages: [
-                {role: "user", content: "Please calculate 2+2"},
+                {role: "user", content: [{text: "Please calculate 2+2", type: "text"}]},
                 {role: "assistant", content: [
                   {
                     "type" => "tool_use",
diff --git a/spec/langchain/assistant/messages/anthropic_message_spec.rb b/spec/langchain/assistant/messages/anthropic_message_spec.rb
@@ -6,53 +6,107 @@
   end
 
   describe "#to_hash" do
-    it "returns function" do
-      message = described_class.new(role: "tool_result", content: "4.0", tool_call_id: "toolu_014eSx9oBA5DMe8gZqaqcJ3H")
-      expect(message.to_hash).to eq(
-        {
-          role: "user",
+    context "when role is assistant" do
+      let(:role) { "assistant" }
+
+      it "returns assistant_hash" do
+        message = described_class.new(role: role, content: "Hello, how can I help you?")
+        expect(message).to receive(:assistant_hash).and_call_original
+        expect(message.to_hash).to eq(
+          role: role,
+          content: "Hello, how can I help you?"
+        )
+      end
+
+      it "returns assistant_hash with tool_calls" do
+        message = described_class.new(
+          role: role,
+          tool_calls: [
+            {
+              "type" => "tool_use",
+              "id" => "toolu_01UEciZACvRZ6S4rqAwD1syH",
+              "name" => "news_retriever__get_everything",
+              "input" => {
+                "q" => "Google I/O 2024",
+                "sort_by" => "publishedAt",
+                "language" => "en"
+              }
+            }
+          ]
+        )
+        expect(message.to_hash).to eq(
+          role: role,
           content: [
             {
-              type: "tool_result",
-              tool_use_id: "toolu_014eSx9oBA5DMe8gZqaqcJ3H",
-              content: "4.0"
+              "type" => "tool_use",
+              "id" => "toolu_01UEciZACvRZ6S4rqAwD1syH",
+              "name" => "news_retriever__get_everything",
+              "input" => {
+                "q" => "Google I/O 2024",
+                "sort_by" => "publishedAt",
+                "language" => "en"
+              }
             }
           ]
-        }
-      )
+        )
+      end
     end
 
-    it "returns tool_calls" do
-      message = described_class.new(
-        role: "assistant",
-        tool_calls: [
+    context "when role is tool_result" do
+      let(:message) { described_class.new(role: "tool_result", content: "4.0", tool_call_id: "toolu_014eSx9oBA5DMe8gZqaqcJ3H") }
+
+      it "returns tool_hash" do
+        expect(message).to receive(:tool_hash).and_call_original
+        expect(message.to_hash).to eq(
           {
-            "type" => "tool_use",
-            "id" => "toolu_01UEciZACvRZ6S4rqAwD1syH",
-            "name" => "news_retriever__get_everything",
-            "input" => {
-              "q" => "Google I/O 2024",
-              "sort_by" => "publishedAt",
-              "language" => "en"
-            }
+            role: "user",
+            content: [
+              {
+                type: "tool_result",
+                tool_use_id: "toolu_014eSx9oBA5DMe8gZqaqcJ3H",
+                content: "4.0"
+              }
+            ]
           }
-        ]
-      )
-      expect(message.to_hash).to eq(
-        role: "assistant",
-        content: [
-          {
-            "type" => "tool_use",
-            "id" => "toolu_01UEciZACvRZ6S4rqAwD1syH",
-            "name" => "news_retriever__get_everything",
-            "input" => {
-              "q" => "Google I/O 2024",
-              "sort_by" => "publishedAt",
-              "language" => "en"
+        )
+      end
+    end
+
+    context "when role is user" do
+      let(:role) { "user" }
+
+      it "returns user_hash" do
+        message = described_class.new(role: role, content: "Hello, how can I help you?")
+        expect(message).to receive(:user_hash).and_call_original
+        expect(message.to_hash).to eq(
+          role: role,
+          content: [
+            {
+              type: "text",
+              text: "Hello, how can I help you?"
             }
-          }
-        ]
-      )
+          ]
+        )
+      end
+
+      it "returns user_hash with image_url" do
+        message = described_class.new(role: role, image_url: "https://example.com/image.jpg")
+        allow(message).to receive(:image).and_return(double(base64: "base64_data", mime_type: "image/jpeg"))
+
+        expect(message.to_hash).to eq(
+          role: role,
+          content: [
+            {
+              type: "image",
+              source: {
+                type: "base64",
+                data: "base64_data",
+                media_type: "image/jpeg"
+              }
+            }
+          ]
+        )
+      end
     end
   end
 end
diff --git a/spec/langchain/assistant/messages/base_spec.rb b/spec/langchain/assistant/messages/base_spec.rb
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+RSpec.describe Langchain::Assistant::Messages::Base do
+  describe "tool?" do
+    it "raises an error" do
+      expect { described_class.new.tool? }.to raise_error(NotImplementedError)
+    end
+  end
+
+  describe "system?" do
+    it "raises an error" do
+      expect { described_class.new.system? }.to raise_error(NotImplementedError)
+    end
+  end
+
+  describe "llm?" do
+    it "raises an error" do
+      expect { described_class.new.llm? }.to raise_error(NotImplementedError)
+    end
+  end
+end
diff --git a/spec/langchain/utils/image_wrapper_spec.rb b/spec/langchain/utils/image_wrapper_spec.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+RSpec.describe Langchain::Utils::ImageWrapper do
+  let(:image_url) { "https://example.com/sf-cable-car.jpeg" }
+
+  before do
+    allow(URI).to receive(:open).with(image_url).and_return(File.open("./spec/fixtures/loaders/sf-cable-car.jpeg"))
+  end
+
+  subject { described_class.new(image_url) }
+
+  describe "#base64" do
+    it "returns the image as a base64 string" do
+      expect(subject.base64).to eq(Base64.strict_encode64(File.read("./spec/fixtures/loaders/sf-cable-car.jpeg")))
+    end
+  end
+
+  xdescribe "#mime_type" do
+    it "returns the mime type of the image" do
+      expect(subject.mime_type).to eq("image/jpeg")
+    end
+  end
+end