discourse · SamSaffron · Feb 17, 2025 · Feb 16, 2025 · Feb 16, 2025 · Feb 16, 2025
diff --git a/app/controllers/discourse_ai/admin/ai_personas_controller.rb b/app/controllers/discourse_ai/admin/ai_personas_controller.rb
@@ -41,7 +41,7 @@ def index
                    tools: tools,
                    llms: llms,
                    settings: {
-                     rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled,
+                     rag_images_enabled: SiteSetting.ai_rag_images_enabled,
                    },
                  },
                }

diff --git a/app/controllers/discourse_ai/admin/rag_document_fragments_controller.rb b/app/controllers/discourse_ai/admin/rag_document_fragments_controller.rb
@@ -48,8 +48,8 @@ def upload_file
 
       def validate_extension!(filename)
         extension = File.extname(filename)[1..-1] || ""
-        authorized_extensions = %w[txt md]
-        authorized_extensions.concat(%w[pdf png jpg jpeg]) if SiteSetting.ai_rag_pdf_images_enabled
+        authorized_extensions = %w[txt md pdf]
+        authorized_extensions.concat(%w[png jpg jpeg]) if SiteSetting.ai_rag_images_enabled
         if !authorized_extensions.include?(extension)
           raise Discourse::InvalidParameters.new(
                   I18n.t(

diff --git a/app/jobs/regular/digest_rag_upload.rb b/app/jobs/regular/digest_rag_upload.rb
@@ -164,22 +164,16 @@ def first_chunk(text, chunk_tokens:, tokenizer:, splitters: ["\n\n", "\n", ".",
     end
 
     def get_uploaded_file(upload:, target:)
-      if %w[pdf png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_pdf_images_enabled
+      if %w[png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_images_enabled
         raise Discourse::InvalidAccess.new(
-                "The setting ai_rag_pdf_images_enabled is false, can not index images and pdfs.",
+                "The setting ai_rag_images_enabled is false, can not index images",
               )
       end
       if upload.extension == "pdf"
-        pages =
-          DiscourseAi::Utils::PdfToImages.new(
-            upload: upload,
-            user: Discourse.system_user,
-          ).uploaded_pages
-
         return(
-          DiscourseAi::Utils::ImageToText.as_fake_file(
-            uploads: pages,
-            llm_model: target.rag_llm_model,
+          DiscourseAi::Utils::PdfToText.as_fake_file(
+            upload: upload,
+            llm_model: SiteSetting.ai_rag_images_enabled ? target.rag_llm_model : nil,
             user: Discourse.system_user,
           )
         )

diff --git a/app/serializers/ai_custom_tool_list_serializer.rb b/app/serializers/ai_custom_tool_list_serializer.rb
@@ -10,7 +10,7 @@ def meta
       presets: AiTool.presets,
       llms: DiscourseAi::Configuration::LlmEnumerator.values_for_serialization,
       settings: {
-        rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled,
+        rag_images_enabled: SiteSetting.ai_rag_images_enabled,
       },
     }
   end

diff --git a/assets/javascripts/discourse/components/ai-persona-editor.gjs b/assets/javascripts/discourse/components/ai-persona-editor.gjs
@@ -596,13 +596,13 @@ export default class PersonaEditor extends Component {
             @target={{this.editingModel}}
             @updateUploads={{this.updateUploads}}
             @onRemove={{this.removeUpload}}
-            @allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}}
+            @allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
           />
         </div>
         <RagOptions
           @model={{this.editingModel}}
           @llms={{@personas.resultSetMeta.llms}}
-          @allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}}
+          @allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
         >
           <div class="control-group">
             <label>{{i18n

diff --git a/assets/javascripts/discourse/components/ai-tool-editor.gjs b/assets/javascripts/discourse/components/ai-tool-editor.gjs
@@ -245,13 +245,13 @@ export default class AiToolEditor extends Component {
               @target={{this.editingModel}}
               @updateUploads={{this.updateUploads}}
               @onRemove={{this.removeUpload}}
-              @allowPdfsAndImages={{@settings.rag_pdf_images_enabled}}
+              @allowImages={{@settings.rag_images_enabled}}
             />
           </div>
           <RagOptions
             @model={{this.editingModel}}
             @llms={{@llms}}
-            @allowPdfsAndImages={{@settings.rag_pdf_images_enabled}}
+            @allowImages={{@settings.rag_images_enabled}}
           />
         {{/if}}
 

diff --git a/assets/javascripts/discourse/components/rag-options.gjs b/assets/javascripts/discourse/components/rag-options.gjs
@@ -81,7 +81,7 @@ export default class RagOptions extends Component {
           }}
         />
       </div>
-      {{#if @allowPdfsAndImages}}
+      {{#if @allowImages}}
         <div class="control-group">
           <label>{{i18n "discourse_ai.rag.options.rag_llm_model"}}</label>
           <AiLlmSelector

diff --git a/assets/javascripts/discourse/components/rag-uploader.gjs b/assets/javascripts/discourse/components/rag-uploader.gjs
@@ -78,10 +78,10 @@ export default class RagUploader extends Component {
   }
 
   get acceptedFileTypes() {
-    if (this.args?.allowPdfsAndImages) {
-      return ".txt,.md,.pdf,.png,.jpg,.jpeg";
+    if (this.args?.allowImages) {
+      return ".txt,.md,.pdf,.png,.jpg,.jpeg"; // PDFs are always accepted; images are opt-in
     } else {
-      return ".txt,.md";
+      return ".txt,.md,.pdf";
     }
   }
 
@@ -127,8 +127,8 @@ export default class RagUploader extends Component {
   <template>
     <div class="rag-uploader">
       <h3>{{i18n "discourse_ai.rag.uploads.title"}}</h3>
-      {{#if @allowPdfsAndImages}}
-        <p>{{i18n "discourse_ai.rag.uploads.description_with_pdfs"}}</p>
+      {{#if @allowImages}}
+        <p>{{i18n "discourse_ai.rag.uploads.description_with_images"}}</p>
       {{else}}
         <p>{{i18n "discourse_ai.rag.uploads.description"}}</p>
       {{/if}}

diff --git a/config/locales/client.en.yml b/config/locales/client.en.yml
@@ -280,8 +280,8 @@ en:
           hide_indexing_options: "Hide upload options"
         uploads:
           title: "Uploads"
-          description: "Plaintext (.txt) or markdown (.md)"
-          description_with_pdfs: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)"
+          description: "PDF (.pdf), Plaintext (.txt) or markdown (.md)"
+          description_with_images: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)"
           button: "Add files"
           filter: "Filter uploads"
           indexed: "Indexed"

diff --git a/config/settings.yml b/config/settings.yml
@@ -355,6 +355,6 @@ discourse_ai:
     hidden: true
     type: list
 
-  ai_rag_pdf_images_enabled:
+  ai_rag_images_enabled:
     default: false
     hidden: true
diff --git a/evals/lib/eval.rb b/evals/lib/eval.rb
@@ -112,22 +112,13 @@ def pdf_to_text(llm, path:)
     upload =
       UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id)
 
-    uploads =
-      DiscourseAi::Utils::PdfToImages.new(
-        upload: upload,
-        user: Discourse.system_user,
-      ).uploaded_pages
-
     text = +""
-    uploads.each do |page_upload|
-      DiscourseAi::Utils::ImageToText
-        .new(upload: page_upload, llm_model: llm.llm_model, user: Discourse.system_user)
-        .extract_text do |chunk, error|
-          text << chunk if chunk
-          text << "\n\n" if chunk
-        end
-      upload.destroy
-    end
+    DiscourseAi::Utils::PdfToText
+      .new(upload: upload, user: Discourse.system_user, llm_model: llm.llm_model)
+      .extract_text do |chunk|
+        text << chunk if chunk
+        text << "\n\n" if chunk
+      end
 
     text
   ensure

diff --git a/lib/utils/image_to_text.rb b/lib/utils/image_to_text.rb
@@ -50,12 +50,27 @@ def self.as_fake_file(uploads:, llm_model:, user:)
     Reader.new(uploads: uploads, llm_model: llm_model, user: user)
   end
 
+  # Reports whether `which tesseract` succeeds. The answer is cached in a
+  # class-level instance variable after the first call.
+  def self.tesseract_installed?
+    return @tesseract_installed if defined?(@tesseract_installed)
+
+    @tesseract_installed =
+      begin
+        Discourse::Utils.execute_command("which", "tesseract")
+        true
+      rescue Discourse::Utils::CommandError
+        false
+      end
+  end
+
   attr_reader :upload, :llm_model, :user
 
-  def initialize(upload:, llm_model:, user:)
+  def initialize(upload:, llm_model:, user:, guidance_text: nil)
     @upload = upload
     @llm_model = llm_model
     @user = user
+    @guidance_text = guidance_text
   end
 
   def extract_text(retries: 3)
@@ -104,15 +119,16 @@ def system_message
   end
 
   def extract_text_from_page(page)
-    raw_text = extract_text_with_tesseract(page)
+    raw_text = @guidance_text
+    raw_text ||= extract_text_with_tesseract(page) if self.class.tesseract_installed?
 
     llm = llm_model.to_llm
     if raw_text.present?
       messages = [
         {
           type: :user,
           content:
-            "The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original meaning:\n\n#{raw_text}",
+            "The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original text:\n\n#{raw_text}",
           upload_ids: [page.id],
         },
       ]
@@ -127,6 +143,8 @@ def extract_text_from_page(page)
   end
 
   def extract_text_with_tesseract(page)
+    # return nil if we can not find the tesseract binary
+    return nil if !Discourse::Utils.which("tesseract")
     upload_path =
       if page.local?
         Discourse.store.path_for(page)

diff --git a/lib/utils/pdf_to_images.rb b/lib/utils/pdf_to_images.rb
@@ -19,8 +19,6 @@ def uploaded_pages
   end
 
   def extract_pages
-    Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
-
     begin
       pdf_path =
         if upload.local?
@@ -31,6 +29,7 @@ def extract_pages
 
       raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?
 
+      temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
       temp_pdf = File.join(temp_dir, "source.pdf")
       FileUtils.cp(pdf_path, temp_pdf)
 

diff --git a/lib/utils/pdf_to_text.rb b/lib/utils/pdf_to_text.rb
@@ -0,0 +1,136 @@
+# frozen_string_literal: true
+
+# Extracts text from a PDF upload one page at a time using pdf-reader. When an
+# LLM model is supplied, each page is also rendered to a PNG and passed through
+# DiscourseAi::Utils::ImageToText, with the raw page text given as guidance.
+class DiscourseAi::Utils::PdfToText
+  # Limit handed to the store when a non-local PDF has to be downloaded.
+  # NOTE(review): 100 megabytes is an assumed ceiling - confirm the value and
+  # the unit download_safe expects for max_file_size_kb.
+  MAX_PDF_SIZE = 100.megabytes
+
+  # Minimal file-like wrapper: #read hands out the extracted text in slices so
+  # callers can consume a PDF the way they would consume an opened file.
+  class Reader
+    def initialize(upload:, user: nil, llm_model: nil)
+      @extractor =
+        DiscourseAi::Utils::PdfToText.new(upload: upload, user: user, llm_model: llm_model)
+      @enumerator = create_enumerator
+      @buffer = +""
+    end
+
+    # Returns up to +length+ characters of extracted text, pulling the next
+    # chunk from the extractor when the buffer is empty. Returns nil once the
+    # extractor has no more chunks.
+    def read(length)
+      return @buffer.slice!(0, length) if !@buffer.empty?
+
+      begin
+        @buffer << @enumerator.next
+      rescue StopIteration
+        return nil
+      end
+
+      @buffer.slice!(0, length)
+    end
+
+    private
+
+    # Lazily drives extract_text; nil chunks are turned into empty strings so
+    # they can be appended to the buffer.
+    def create_enumerator
+      Enumerator.new { |yielder| @extractor.extract_text { |chunk| yielder.yield(chunk || "") } }
+    end
+  end
+
+  attr_reader :upload
+
+  # Convenience constructor for a Reader over the given upload.
+  def self.as_fake_file(upload:, user: nil, llm_model: nil)
+    Reader.new(upload: upload, user: user, llm_model: llm_model)
+  end
+
+  # upload    - the PDF upload to read
+  # user      - user the per-page image uploads are created for; also passed to ImageToText
+  # llm_model - optional model; when nil the raw pdf-reader text is yielded as is
+  def initialize(upload:, user: nil, llm_model: nil)
+    @upload = upload
+    @user = user
+    @llm_model = llm_model
+  end
+
+  # Yields the text of every page, in page order, to the given block.
+  # Raises Discourse::InvalidParameters when no path to the PDF can be obtained.
+  def extract_text
+    pdf_path =
+      if upload.local?
+        Discourse.store.path_for(upload)
+      else
+        Discourse.store.download_safe(upload, max_file_size_kb: MAX_PDF_SIZE)&.path
+      end
+
+    raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?
+
+    # required lazily, only when a PDF is actually processed
+    require "pdf/reader"
+
+    PDF::Reader.open(pdf_path) do |reader|
+      reader.pages.each.with_index(1) do |page, page_number|
+        llm_decorate(page_number: page_number, text: page.text, pdf_path: pdf_path) do |chunk|
+          yield chunk
+        end
+      end
+    end
+  end
+
+  # Yields the text for one page. Without an LLM model the pdf-reader text is
+  # yielded unchanged; otherwise the page is rendered to a PNG with ImageMagick,
+  # uploaded, and run through ImageToText with +text+ as guidance.
+  def llm_decorate(page_number:, text:, pdf_path:)
+    raise "Must be called with block" if !block_given?
+    if !@llm_model
+      yield text
+      return
+    end
+
+    begin
+      temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
+      output_path = File.join(temp_dir, "page-#{page_number}.png")
+
+      # Extract specific page using ImageMagick
+      # image magick uses 0 based page numbers
+      command = [
+        "magick",
+        "-density",
+        "300",
+        "#{pdf_path}[#{page_number - 1}]",
+        "-background",
+        "white",
+        "-auto-orient",
+        "-quality",
+        "85",
+        output_path,
+      ]
+
+      Discourse::Utils.execute_command(
+        *command,
+        failure_message: "Failed to convert PDF page #{page_number} to image",
+        timeout: 30,
+      )
+
+      # TODO - we are creating leftover uploads, they will be cleaned up
+      # but maybe we should just keep them around?
+      # Block form so the image file handle is closed once the upload exists.
+      page_upload =
+        File.open(output_path) do |file|
+          UploadCreator.new(file, "page-#{page_number}.png").create_for(@user&.id)
+        end
+
+      DiscourseAi::Utils::ImageToText
+        .new(upload: page_upload, llm_model: @llm_model, user: @user, guidance_text: text)
+        .extract_text { |chunk| yield chunk }
+    ensure
+      FileUtils.rm_rf(temp_dir) if temp_dir
+    end
+  end
+end
diff --git a/plugin.rb b/plugin.rb
@@ -12,6 +12,16 @@
 gem "tiktoken_ruby", "0.0.9"
 gem "ed25519", "1.2.4" #TODO remove this as existing ssl gem should handle this
 
+# we probably want to move all dependencies directly into the Discourse Gemfile; this
+# will give us a strong guarantee that the dependencies are compatible and keep getting upgraded
+gem "Ascii85", "2.0.1", require: false
+gem "ruby-rc4", "0.1.5", require: false
+gem "hashery", "2.1.2", require: false
+gem "ttfunk", "1.8.0", require: false
+gem "afm", "0.2.2", require: false
+# all above are required by pdf-reader
+gem "pdf-reader", "2.14.1", require: false
+
 enabled_site_setting :discourse_ai_enabled
 
 register_asset "stylesheets/common/streaming.scss"

diff --git a/spec/fixtures/rag/2-page.pdf b/spec/fixtures/rag/2-page.pdf
diff --git a/spec/jobs/regular/digest_rag_upload_spec.rb b/spec/jobs/regular/digest_rag_upload_spec.rb
@@ -3,7 +3,7 @@
 RSpec.describe Jobs::DigestRagUpload do
   fab!(:persona) { Fabricate(:ai_persona) }
   fab!(:upload) { Fabricate(:upload, extension: "txt") }
-  fab!(:pdf_upload) { Fabricate(:upload, extension: "pdf") }
+  fab!(:image_upload) { Fabricate(:upload, extension: "png") }
   let(:document_file) { StringIO.new("some text" * 200) }
 
   fab!(:cloudflare_embedding_def)
@@ -31,13 +31,13 @@
   end
 
   describe "#execute" do
-    context "when processing a PDF upload" do
+    context "when processing an image upload" do
       it "will reject the indexing if the site setting is not enabled" do
-        SiteSetting.ai_rag_pdf_images_enabled = false
+        SiteSetting.ai_rag_images_enabled = false
 
         expect {
           described_class.new.execute(
-            upload_id: pdf_upload.id,
+            upload_id: image_upload.id,
             target_id: persona.id,
             target_type: persona.class.to_s,
           )