diff --git a/examples/generate_image.rb b/examples/generate_image.rb
new file mode 100644
index 000000000..ff75e2ea3
--- /dev/null
+++ b/examples/generate_image.rb
@@ -0,0 +1,118 @@
+# frozen_string_literal: true
+
+# Example demonstrating Langchain's unified image generation API across providers
+#
+# Prerequisites (set any of these):
+#   export OPENAI_API_KEY="your_api_key"
+#   export GOOGLE_GEMINI_API_KEY="your_api_key"
+#   export GOOGLE_VERTEX_AI_PROJECT_ID="your_project_id"
+#
+# Run with:
+#   bundle exec ruby examples/generate_image.rb
+
+require "bundler/inline"
+
+# Ensure dependencies when running standalone, outside of a full gem install.
+# This is skipped if the gems are already present in the main Gemfile.
+gemfile(true) do
+  source "https://rubygems.org"
+  gem "ruby-openai", ">= 6.3"
+  gem "googleauth" # For Google Vertex AI
+  gem "langchainrb", path: File.expand_path("..", __dir__)
+end
+
+require "langchainrb"
+require "base64"
+
+# Build an array of available LLM providers based on environment variables
+llms = []
+
+# OpenAI
+if ENV["OPENAI_API_KEY"]
+  llms << {
+    name: "OpenAI DALL-E 3",
+    instance: Langchain::LLM::OpenAI.new(api_key: ENV["OPENAI_API_KEY"]),
+    options: {size: "1024x1024"}
+  }
+end
+
+# Google Gemini
+if ENV["GOOGLE_GEMINI_API_KEY"]
+  llms << {
+    name: "Google Gemini",
+    instance: Langchain::LLM::GoogleGemini.new(api_key: ENV["GOOGLE_GEMINI_API_KEY"]),
+    options: {n: 1}
+  }
+end
+
+# Google Vertex AI
+if ENV["GOOGLE_VERTEX_AI_PROJECT_ID"]
+  region = ENV.fetch("GOOGLE_VERTEX_AI_REGION", "us-central1")
+  llms << {
+    name: "Google Vertex AI (Imagen)",
+    instance: Langchain::LLM::GoogleVertexAI.new(
+      project_id: ENV["GOOGLE_VERTEX_AI_PROJECT_ID"],
+      region: region
+    ),
+    options: {n: 1}
+  }
+end
+
+if llms.empty?
+  puts "No LLM providers configured. Please set at least one of:"
+  puts "  - OPENAI_API_KEY"
+  puts "  - GOOGLE_GEMINI_API_KEY"
+  puts "  - GOOGLE_VERTEX_AI_PROJECT_ID"
+  exit 1
+end
+
+# Common prompt for all providers
+PROMPT = "A minimalist illustration of a ruby gemstone on a dark background"
+
+puts "Generating images with prompt: \"#{PROMPT}\""
+puts "Using #{llms.length} provider(s)"
+puts
+
+# Demonstrate the unified API - the same method call works across all providers
+llms.each do |llm_config|
+  puts "=== #{llm_config[:name]} ==="
+
+  begin
+    # Unified API call - works the same for all providers
+    response = llm_config[:instance].generate_image(
+      prompt: PROMPT,
+      **llm_config[:options]
+    )
+
+    # Handle the different response formats
+    if response.respond_to?(:image_urls) && !response.image_urls.empty?
+      puts "✓ Generated #{response.image_urls.count} image(s)"
+      response.image_urls.each_with_index do |url, i|
+        puts "  Image #{i + 1} URL: #{url}"
+      end
+    elsif response.respond_to?(:image_base64s) && !response.image_base64s.empty?
+      puts "✓ Generated #{response.image_base64s.count} image(s)"
+      response.image_base64s.each_with_index do |data, i|
+        filename = "#{llm_config[:name].downcase.gsub(/\s+/, '_')}_image_#{i + 1}.png"
+        begin
+          decoded_data = Base64.decode64(data)
+          File.binwrite(filename, decoded_data)
+          puts "  Image #{i + 1}: Saved to #{filename} (#{decoded_data.bytesize} bytes)"
+        rescue => e
+          puts "  Image #{i + 1}: Base64 data received (#{data.length} chars) - error saving: #{e.message}"
+        end
+      end
+    else
+      puts "✗ No images in response"
+    end
+  rescue => e
+    puts "✗ Error: #{e.message}"
+  end
+
+  puts
+end
+
+puts "Summary:"
+puts "- All providers use the same `generate_image` method"
+puts "- Responses provide either `image_urls` or `image_base64s`"
+puts "- This unified API makes it easy to switch between providers"
\ No newline at end of file
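The script above exercises every configured provider. For a quicker smoke test against a single provider, a minimal sketch (assuming `OPENAI_API_KEY` is exported and the gem is installed; the prompt text is arbitrary):

```ruby
require "langchainrb"

# Smallest possible exercise of the unified API; OpenAI returns hosted URLs by default
llm = Langchain::LLM::OpenAI.new(api_key: ENV["OPENAI_API_KEY"])
response = llm.generate_image(prompt: "A red panda reading a book")
puts response.image_urls
```
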
diff --git a/lib/langchain/llm/base.rb b/lib/langchain/llm/base.rb
index e6b1f4d5d..e02dd7d3c 100644
--- a/lib/langchain/llm/base.rb
+++ b/lib/langchain/llm/base.rb
@@ -77,6 +77,15 @@ def summarize(...)
       raise NotImplementedError, "#{self.class.name} does not support summarization"
     end
 
+    #
+    # Generate an image for a given prompt. Parameters will depend on the LLM provider.
+    #
+    # @raise NotImplementedError if not supported by the LLM
+    #
+    def generate_image(...)
+      raise NotImplementedError, "#{self.class.name} does not support image generation"
+    end
+
     #
     # Returns an instance of Langchain::LLM::Parameters::Chat
     #
diff --git a/lib/langchain/llm/google_gemini.rb b/lib/langchain/llm/google_gemini.rb
index 5fc204e80..54b717906 100644
--- a/lib/langchain/llm/google_gemini.rb
+++ b/lib/langchain/llm/google_gemini.rb
@@ -7,6 +7,7 @@ class GoogleGemini < Base
     DEFAULTS = {
       chat_model: "gemini-1.5-pro-latest",
       embedding_model: "text-embedding-004",
+      image_generation_model: "gemini-2.0-flash-preview-image-generation",
       temperature: 0.0
     }
 
@@ -91,6 +92,30 @@ def embed(
       Langchain::LLM::Response::GoogleGeminiResponse.new(parsed_response, model: model)
     end
 
+    # Generate an image for a given prompt using the Gemini image generation capability
+    #
+    # @param prompt [String] The textual prompt for the desired image
+    # @param n [Integer] Number of images to generate (candidateCount) (default 1)
+    # @return [Langchain::LLM::Response::GoogleGeminiResponse] Response wrapper
+    def generate_image(prompt:, n: 1)
+      raise ArgumentError.new("prompt argument is required") if prompt.to_s.strip.empty?
+
+      parameters = {
+        contents: [{parts: [{text: prompt}]}],
+        generationConfig: {
+          responseModalities: ["TEXT", "IMAGE"],
+          candidateCount: n
+        }
+      }
+
+      model = @defaults[:image_generation_model]
+      uri = URI("https://generativelanguage.googleapis.com/v1beta/models/#{model}:generateContent?key=#{api_key}")
+
+      parsed_response = http_post(uri, parameters)
+
+      Langchain::LLM::Response::GoogleGeminiResponse.new(parsed_response, model: model)
+    end
+
     private
 
     def http_post(url, params)
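For context on what `http_post` hands back here: a Gemini `generateContent` image response nests base64 payloads under `candidates → content → parts → inlineData`. An illustrative, abridged shape (not verbatim API output), matching the keys the `image_base64s` accessor added later in this diff digs for:

```ruby
# Abridged shape of a Gemini image-generation response (illustrative, not verbatim API output)
parsed_response = {
  "candidates" => [
    {
      "content" => {
        "parts" => [
          {"text" => "Here is the image you requested"},                            # TEXT part
          {"inlineData" => {"mimeType" => "image/png", "data" => "iVBORw0KGgo..."}} # IMAGE part
        ]
      }
    }
  ],
  "usageMetadata" => {"totalTokenCount" => 1290}
}

response = Langchain::LLM::Response::GoogleGeminiResponse.new(parsed_response)
response.image_base64s # => ["iVBORw0KGgo..."]
```
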
diff --git a/lib/langchain/llm/google_vertexai.rb b/lib/langchain/llm/google_vertexai.rb
index a59ba95dd..303a5a44b 100644
--- a/lib/langchain/llm/google_vertexai.rb
+++ b/lib/langchain/llm/google_vertexai.rb
@@ -18,7 +18,8 @@ class GoogleVertexAI < Base
       top_k: 40,
       dimensions: 768,
       embedding_model: "textembedding-gecko",
-      chat_model: "gemini-1.0-pro"
+      chat_model: "gemini-1.0-pro",
+      image_generation_model: "imagen-3.0-generate-002"
     }.freeze
 
     # Google Cloud has a project id and a specific region of deployment.
@@ -99,6 +100,33 @@ def chat(params = {})
       end
     end
 
+    # Generate images with the Imagen model via Vertex AI
+    #
+    # @param prompt [String] The text prompt for the image
+    # @param n [Integer] Number of images to generate (1-4)
+    # @return [Langchain::LLM::Response::GoogleVertexAIResponse]
+    def generate_image(prompt:, n: 1)
+      raise ArgumentError.new("prompt argument is required") if prompt.to_s.strip.empty?
+
+      params = {
+        instances: [
+          {
+            prompt: prompt
+          }
+        ],
+        parameters: {
+          sampleCount: n
+        }
+      }
+
+      model = @defaults[:image_generation_model]
+      uri = URI("#{url}#{model}:predict")
+
+      parsed_response = http_post(uri, params)
+
+      Langchain::LLM::Response::GoogleVertexAIResponse.new(parsed_response, model: model)
+    end
+
     private
 
     def http_post(url, params)
diff --git a/lib/langchain/llm/openai.rb b/lib/langchain/llm/openai.rb
index add1eacbc..7386a2564 100644
--- a/lib/langchain/llm/openai.rb
+++ b/lib/langchain/llm/openai.rb
@@ -18,7 +18,8 @@ class OpenAI < Base
     DEFAULTS = {
       n: 1,
       chat_model: "gpt-4o-mini",
-      embedding_model: "text-embedding-3-small"
+      embedding_model: "text-embedding-3-small",
+      image_generation_model: "dall-e-3"
     }.freeze
 
     EMBEDDING_SIZES = {
@@ -161,6 +162,29 @@ def summarize(text:)
       complete(prompt: prompt)
     end
 
+    # Generate images for a given prompt using the OpenAI Images API
+    #
+    # @param prompt [String] Textual prompt describing the desired image
+    # @param n [Integer] Number of images to generate (default 1)
+    # @param size [String] Requested resolution, e.g. "1024x1024" (default "1024x1024")
+    # @return [Langchain::LLM::Response::OpenAIResponse] Wrapper around the raw response
+    def generate_image(prompt:, n: 1, size: "1024x1024")
+      raise ArgumentError, "prompt argument is required" if prompt.to_s.strip.empty?
+
+      parameters = {
+        prompt: prompt,
+        n: n,
+        size: size,
+        model: @defaults[:image_generation_model]
+      }
+
+      response = with_api_error_handling do
+        client.images.generate(parameters: parameters)
+      end
+
+      Langchain::LLM::Response::OpenAIResponse.new(response)
+    end
+
     def default_dimensions
       @defaults[:dimensions] || EMBEDDING_SIZES.fetch(defaults[:embedding_model])
     end
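Unlike the Google providers, DALL-E 3 returns hosted URLs rather than inline base64, and those URLs are typically short-lived. Callers wanting a local file therefore need one extra download step; a minimal stdlib-only sketch, assuming `response` came from the `generate_image` call above and the output filename is arbitrary:

```ruby
require "net/http"
require "uri"

# Download the first generated image; OpenAI-hosted image URLs expire, so fetch promptly
url = response.image_urls.first
File.binwrite("generated_image.png", Net::HTTP.get(URI(url)))
```
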
diff --git a/lib/langchain/llm/response/google_gemini_response.rb b/lib/langchain/llm/response/google_gemini_response.rb
index 3a974d663..0a4a590e0 100644
--- a/lib/langchain/llm/response/google_gemini_response.rb
+++ b/lib/langchain/llm/response/google_gemini_response.rb
@@ -45,5 +45,16 @@ def completion_tokens
     def total_tokens
       raw_response.dig("usageMetadata", "totalTokenCount")
     end
+
+    # Returns an array of base64 image data extracted from inlineData parts
+    def image_base64s
+      candidates = raw_response["candidates"] || []
+      candidates.flat_map do |candidate|
+        parts = candidate.dig("content", "parts") || []
+        parts.filter_map { |part| part.dig("inlineData", "data") }
+      end
+    end
+
+    alias_method :image_blobs, :image_base64s
   end
 end
diff --git a/lib/langchain/llm/response/google_vertex_ai_response.rb b/lib/langchain/llm/response/google_vertex_ai_response.rb
new file mode 100644
index 000000000..05d38dd7f
--- /dev/null
+++ b/lib/langchain/llm/response/google_vertex_ai_response.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+module Langchain::LLM::Response
+  class GoogleVertexAIResponse < BaseResponse
+    # Imagen responses place base64 image bytes in the predictions list; depending on
+    # the API version the key may be "bytesBase64Encoded", "bytes", or nested image keys.
+    def image_base64s
+      Array(raw_response["predictions"]).map do |pred|
+        pred["bytesBase64Encoded"] || pred["bytes"] || pred.dig("image", "image_bytes") || pred.dig("image", "imageBytes")
+      end.compact
+    end
+
+    alias_method :image_blobs, :image_base64s
+
+    # Remaining response accessors are not applicable to image responses
+    def chat_completion; nil; end
+    def embedding; nil; end
+    def embeddings; []; end
+    def prompt_tokens; nil; end
+    def completion_tokens; nil; end
+    def total_tokens; nil; end
+  end
+end
\ No newline at end of file
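For reference, an Imagen `:predict` response carries one prediction per requested sample; in current API versions the base64 payload is typically keyed `bytesBase64Encoded`, which is why that key leads the fallback chain above. An illustrative, abridged shape:

```ruby
# Abridged shape of an Imagen predict response (illustrative, not verbatim API output)
parsed_response = {
  "predictions" => [
    {"bytesBase64Encoded" => "iVBORw0KGgo...", "mimeType" => "image/png"}
  ]
}

response = Langchain::LLM::Response::GoogleVertexAIResponse.new(parsed_response)
response.image_base64s # => ["iVBORw0KGgo..."]
```
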
diff --git a/lib/langchain/llm/response/openai_response.rb b/lib/langchain/llm/response/openai_response.rb
index d47936d2a..d21adeca2 100644
--- a/lib/langchain/llm/response/openai_response.rb
+++ b/lib/langchain/llm/response/openai_response.rb
@@ -59,5 +59,12 @@ def completion_tokens
     def total_tokens
       raw_response.dig("usage", "total_tokens")
     end
+
+    # Returns an array of image URLs when the response comes from the image generation endpoint
+    #
+    # @return [Array] list of image URLs, or [] if not present
+    def image_urls
+      Array(raw_response["data"]).map { |d| d["url"] }.compact
+    end
   end
 end
diff --git a/spec/lib/langchain/llm/base_spec.rb b/spec/lib/langchain/llm/base_spec.rb
index df3f8c96f..11b792f9b 100644
--- a/spec/lib/langchain/llm/base_spec.rb
+++ b/spec/lib/langchain/llm/base_spec.rb
@@ -48,6 +48,12 @@ def initialize
     end
   end
 
+  describe "#generate_image" do
+    it "raises an error" do
+      expect { subject.generate_image }.to raise_error(NotImplementedError)
+    end
+  end
+
   describe "#chat_parameters(params = {})" do
     subject { TestLLM.new }
 
diff --git a/spec/lib/langchain/llm/google_gemini_image_spec.rb b/spec/lib/langchain/llm/google_gemini_image_spec.rb
new file mode 100644
index 000000000..95dfc9620
--- /dev/null
+++ b/spec/lib/langchain/llm/google_gemini_image_spec.rb
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+RSpec.describe Langchain::LLM::GoogleGemini do
+  let(:subject) { described_class.new(api_key: "XYZ") }
+
+  describe "#generate_image" do
+    let(:prompt) { "Generate a minimalistic landscape" }
+    let(:model_id) { "gemini-2.0-flash-preview-image-generation" }
+    let(:uri) { URI("https://generativelanguage.googleapis.com/v1beta/models/#{model_id}:generateContent?key=XYZ") }
+    let(:params) do
+      {
+        contents: [{parts: [{text: prompt}]}],
+        generationConfig: {responseModalities: ["TEXT", "IMAGE"], candidateCount: 1}
+      }
+    end
+    let(:api_response) do
+      {"candidates" => [{"content" => {"parts" => [{"inlineData" => {"data" => "BASE64STRING"}}]}}]}
+    end
+
+    before do
+      http_response = double("response", body: api_response.to_json)
+      http = double("http")
+      allow(http).to receive(:use_ssl=)
+      allow(http).to receive(:set_debug_output)
+      allow(http).to receive(:request).and_return(http_response)
+      allow(Net::HTTP).to receive(:new).and_return(http)
+    end
+
+    it "returns a response wrapper" do
+      resp = subject.generate_image(prompt: prompt)
+      expect(resp).to be_a(Langchain::LLM::Response::GoogleGeminiResponse)
+    end
+  end
+end
\ No newline at end of file
diff --git a/spec/lib/langchain/llm/google_vertexai_image_spec.rb b/spec/lib/langchain/llm/google_vertexai_image_spec.rb
new file mode 100644
index 000000000..8719b482e
--- /dev/null
+++ b/spec/lib/langchain/llm/google_vertexai_image_spec.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+require "googleauth"
+require_relative "#{Langchain.root}/langchain/llm/response/google_vertex_ai_response"
+
+RSpec.describe Langchain::LLM::GoogleVertexAI do
+  let(:subject) { described_class.new(project_id: "proj", region: "us-central1") }
+
+  before do
+    allow(Google::Auth).to receive(:get_application_default).and_return(
+      double("Google::Auth::UserRefreshCredentials", fetch_access_token!: {access_token: 123})
+    )
+  end
+
+  describe "#generate_image" do
+    let(:prompt) { "A cartoon cat" }
+    let(:model) { "imagen-3.0-generate-002" }
+    let(:uri) { URI("#{subject.url}#{model}:predict") }
+    let(:params) { {instances: [{prompt: prompt}], parameters: {sampleCount: 1}} }
+    let(:api_response) { {"predictions" => [{"bytes" => "BASE64IMG"}]} }
+
+    before do
+      allow_any_instance_of(Net::HTTP).to receive(:request).and_return(double(body: api_response.to_json))
+    end
+
+    it "returns a wrapper with base64s" do
+      resp = subject.generate_image(prompt: prompt)
+      expect(resp).to be_a(Langchain::LLM::Response::GoogleVertexAIResponse)
+      expect(resp.image_base64s).to eq(["BASE64IMG"])
+    end
+  end
+end
\ No newline at end of file
diff --git a/spec/lib/langchain/llm/openai_image_spec.rb b/spec/lib/langchain/llm/openai_image_spec.rb
new file mode 100644
index 000000000..87f90462c
--- /dev/null
+++ b/spec/lib/langchain/llm/openai_image_spec.rb
@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+
+require "openai"
+
+RSpec.describe Langchain::LLM::OpenAI do
+  let(:subject) { described_class.new(api_key: "123") }
+
+  describe "#generate_image" do
+    let(:prompt) { "A cute baby sea otter" }
+    let(:parameters) { {parameters: {prompt: prompt, n: 1, size: "1024x1024", model: "dall-e-3"}} }
+    let(:openai_response) { {"created" => 1_721_918_400, "data" => [{"url" => "https://example.com/otter.png"}]} }
+
+    before do
+      images_stub = double("images")
+      allow(subject.client).to receive(:images).and_return(images_stub)
+      allow(images_stub).to receive(:generate).with(parameters).and_return(openai_response)
+    end
+
+    it "returns an OpenAIResponse with image URLs" do
+      response = subject.generate_image(prompt: prompt)
+
+      expect(response).to be_a(Langchain::LLM::Response::OpenAIResponse)
+      expect(response.image_urls).to eq(["https://example.com/otter.png"])
+    end
+  end
+end
\ No newline at end of file
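Downstream code built on this API does not need the HTTP-level stubbing used in the specs above; because every provider exposes the same `generate_image` entry point, a plain test double suffices. A sketch of that pattern in RSpec (all names illustrative):

```ruby
# Stub the unified interface instead of provider HTTP internals (illustrative pattern)
fake_response = double("ImageResponse", image_urls: ["https://example.com/fake.png"])
llm = instance_double(Langchain::LLM::OpenAI, generate_image: fake_response)

expect(llm.generate_image(prompt: "anything").image_urls).to eq(["https://example.com/fake.png"])
```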