Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def index
tools: tools,
llms: llms,
settings: {
rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled,
rag_images_enabled: SiteSetting.ai_rag_images_enabled,
},
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ def upload_file

def validate_extension!(filename)
extension = File.extname(filename)[1..-1] || ""
authorized_extensions = %w[txt md]
authorized_extensions.concat(%w[pdf png jpg jpeg]) if SiteSetting.ai_rag_pdf_images_enabled
authorized_extensions = %w[txt md pdf]
authorized_extensions.concat(%w[png jpg jpeg]) if SiteSetting.ai_rag_images_enabled
if !authorized_extensions.include?(extension)
raise Discourse::InvalidParameters.new(
I18n.t(
Expand Down
16 changes: 5 additions & 11 deletions app/jobs/regular/digest_rag_upload.rb
Original file line number Diff line number Diff line change
Expand Up @@ -164,22 +164,16 @@ def first_chunk(text, chunk_tokens:, tokenizer:, splitters: ["\n\n", "\n", ".",
end

def get_uploaded_file(upload:, target:)
if %w[pdf png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_pdf_images_enabled
if %w[png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_images_enabled
raise Discourse::InvalidAccess.new(
"The setting ai_rag_pdf_images_enabled is false, can not index images and pdfs.",
"The setting ai_rag_images_enabled is false, can not index images",
)
end
if upload.extension == "pdf"
pages =
DiscourseAi::Utils::PdfToImages.new(
upload: upload,
user: Discourse.system_user,
).uploaded_pages

return(
DiscourseAi::Utils::ImageToText.as_fake_file(
uploads: pages,
llm_model: target.rag_llm_model,
DiscourseAi::Utils::PdfToText.as_fake_file(
upload: upload,
llm_model: SiteSetting.ai_rag_images_enabled ? target.rag_llm_model : nil,
user: Discourse.system_user,
)
)
Expand Down
2 changes: 1 addition & 1 deletion app/serializers/ai_custom_tool_list_serializer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def meta
presets: AiTool.presets,
llms: DiscourseAi::Configuration::LlmEnumerator.values_for_serialization,
settings: {
rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled,
rag_images_enabled: SiteSetting.ai_rag_images_enabled,
},
}
end
Expand Down
4 changes: 2 additions & 2 deletions assets/javascripts/discourse/components/ai-persona-editor.gjs
Original file line number Diff line number Diff line change
Expand Up @@ -596,13 +596,13 @@ export default class PersonaEditor extends Component {
@target={{this.editingModel}}
@updateUploads={{this.updateUploads}}
@onRemove={{this.removeUpload}}
@allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}}
@allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
/>
</div>
<RagOptions
@model={{this.editingModel}}
@llms={{@personas.resultSetMeta.llms}}
@allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}}
@allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
>
<div class="control-group">
<label>{{i18n
Expand Down
4 changes: 2 additions & 2 deletions assets/javascripts/discourse/components/ai-tool-editor.gjs
Original file line number Diff line number Diff line change
Expand Up @@ -245,13 +245,13 @@ export default class AiToolEditor extends Component {
@target={{this.editingModel}}
@updateUploads={{this.updateUploads}}
@onRemove={{this.removeUpload}}
@allowPdfsAndImages={{@settings.rag_pdf_images_enabled}}
@allowImages={{@settings.rag_images_enabled}}
/>
</div>
<RagOptions
@model={{this.editingModel}}
@llms={{@llms}}
@allowPdfsAndImages={{@settings.rag_pdf_images_enabled}}
@allowImages={{@settings.rag_images_enabled}}
/>
{{/if}}

Expand Down
2 changes: 1 addition & 1 deletion assets/javascripts/discourse/components/rag-options.gjs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ export default class RagOptions extends Component {
}}
/>
</div>
{{#if @allowPdfsAndImages}}
{{#if @allowImages}}
<div class="control-group">
<label>{{i18n "discourse_ai.rag.options.rag_llm_model"}}</label>
<AiLlmSelector
Expand Down
10 changes: 5 additions & 5 deletions assets/javascripts/discourse/components/rag-uploader.gjs
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ export default class RagUploader extends Component {
}

get acceptedFileTypes() {
if (this.args?.allowPdfsAndImages) {
return ".txt,.md,.pdf,.png,.jpg,.jpeg";
if (this.args?.allowImages) {
return ".txt,.md,.png,.jpg,.jpeg";
} else {
return ".txt,.md";
return ".txt,.md,.pdf";
}
}

Expand Down Expand Up @@ -127,8 +127,8 @@ export default class RagUploader extends Component {
<template>
<div class="rag-uploader">
<h3>{{i18n "discourse_ai.rag.uploads.title"}}</h3>
{{#if @allowPdfsAndImages}}
<p>{{i18n "discourse_ai.rag.uploads.description_with_pdfs"}}</p>
{{#if @allowImages}}
<p>{{i18n "discourse_ai.rag.uploads.description_with_images"}}</p>
{{else}}
<p>{{i18n "discourse_ai.rag.uploads.description"}}</p>
{{/if}}
Expand Down
4 changes: 2 additions & 2 deletions config/locales/client.en.yml
Original file line number Diff line number Diff line change
Expand Up @@ -280,8 +280,8 @@ en:
hide_indexing_options: "Hide upload options"
uploads:
title: "Uploads"
description: "Plaintext (.txt) or markdown (.md)"
description_with_pdfs: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)"
description: "PDF (.pdf), Plaintext (.txt) or markdown (.md)"
description_with_images: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)"
button: "Add files"
filter: "Filter uploads"
indexed: "Indexed"
Expand Down
2 changes: 1 addition & 1 deletion config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,6 @@ discourse_ai:
hidden: true
type: list

ai_rag_pdf_images_enabled:
ai_rag_images_enabled:
default: false
hidden: true
21 changes: 6 additions & 15 deletions evals/lib/eval.rb
Original file line number Diff line number Diff line change
Expand Up @@ -112,22 +112,13 @@ def pdf_to_text(llm, path:)
upload =
UploadCreator.new(File.open(path), File.basename(path)).create_for(Discourse.system_user.id)

uploads =
DiscourseAi::Utils::PdfToImages.new(
upload: upload,
user: Discourse.system_user,
).uploaded_pages

text = +""
uploads.each do |page_upload|
DiscourseAi::Utils::ImageToText
.new(upload: page_upload, llm_model: llm.llm_model, user: Discourse.system_user)
.extract_text do |chunk, error|
text << chunk if chunk
text << "\n\n" if chunk
end
upload.destroy
end
DiscourseAi::Utils::PdfToText
.new(upload: upload, user: Discourse.system_user, llm_model: llm.llm_model)
.extract_text do |chunk|
text << chunk if chunk
text << "\n\n" if chunk
end

text
ensure
Expand Down
24 changes: 21 additions & 3 deletions lib/utils/image_to_text.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,27 @@ def self.as_fake_file(uploads:, llm_model:, user:)
Reader.new(uploads: uploads, llm_model: llm_model, user: user)
end

# Reports whether a `tesseract` executable can be located with `which`.
#
# The lookup runs at most once per process: the outcome (true or false) is
# memoised in a class-level instance variable and returned on later calls.
def self.tesseract_installed?
  return @tesseract_installed if defined?(@tesseract_installed)

  found = true
  begin
    Discourse::Utils.execute_command("which", "tesseract")
  rescue Discourse::Utils::CommandError
    # execute_command raised, so treat the binary as unavailable
    found = false
  end

  @tesseract_installed = found
end

attr_reader :upload, :llm_model, :user

def initialize(upload:, llm_model:, user:)
def initialize(upload:, llm_model:, user:, guidance_text: nil)
@upload = upload
@llm_model = llm_model
@user = user
@guidance_text = guidance_text
end

def extract_text(retries: 3)
Expand Down Expand Up @@ -104,15 +119,16 @@ def system_message
end

def extract_text_from_page(page)
raw_text = extract_text_with_tesseract(page)
raw_text = @guidance_text
raw_text ||= extract_text_with_tesseract(page) if self.class.tesseract_installed?

llm = llm_model.to_llm
if raw_text.present?
messages = [
{
type: :user,
content:
"The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original meaning:\n\n#{raw_text}",
"The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original text:\n\n#{raw_text}",
upload_ids: [page.id],
},
]
Expand All @@ -127,6 +143,8 @@ def extract_text_from_page(page)
end

def extract_text_with_tesseract(page)
# return nil if we cannot find the tesseract binary
return nil if !Discourse::Utils.which("tesseract")
upload_path =
if page.local?
Discourse.store.path_for(page)
Expand Down
3 changes: 1 addition & 2 deletions lib/utils/pdf_to_images.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ def uploaded_pages
end

def extract_pages
Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")

begin
pdf_path =
if upload.local?
Expand All @@ -31,6 +29,7 @@ def extract_pages

raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?

temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
temp_pdf = File.join(temp_dir, "source.pdf")
FileUtils.cp(pdf_path, temp_pdf)

Expand Down
110 changes: 110 additions & 0 deletions lib/utils/pdf_to_text.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# frozen_string_literal: true

class DiscourseAi::Utils::PdfToText
# IO-like wrapper around PdfToText: exposes the extracted text through
# #read(length) and pulls chunks from the extractor lazily, only when the
# internal buffer has been drained.
class Reader
  # upload:    PDF upload, passed through to PdfToText
  # user:      passed through to PdfToText (optional)
  # llm_model: passed through to PdfToText (optional)
  def initialize(upload:, user: nil, llm_model: nil)
    @extractor =
      DiscourseAi::Utils::PdfToText.new(upload: upload, user: user, llm_model: llm_model)
    @enumerator = create_enumerator
    @buffer = +""
  end

  # Returns at most `length` characters of extracted text, or nil once the
  # extractor is exhausted and the buffer is empty.
  #
  # Empty chunks are skipped instead of being returned as "", so a caller
  # that treats an empty read as end-of-file does not stop early.
  def read(length)
    fill_buffer if @buffer.empty?
    return nil if @buffer.empty?

    @buffer.slice!(0, length)
  end

  private

  # Pulls chunks until the buffer holds some text or the extractor runs out.
  def fill_buffer
    @buffer << @enumerator.next while @buffer.empty?
  rescue StopIteration
    # extractor exhausted; the buffer stays empty so #read returns nil
  end

  # Wraps the block-based extractor in an external enumerator; nil chunks
  # are normalised to "" so they can always be appended to the buffer.
  def create_enumerator
    Enumerator.new { |yielder| @extractor.extract_text { |chunk| yielder.yield(chunk || "") } }
  end
end

attr_reader :upload

# Builds a Reader for the given upload so the extracted PDF text can be
# consumed through a file-like #read interface. All arguments are forwarded
# to Reader unchanged.
def self.as_fake_file(upload:, user: nil, llm_model: nil)
  reader_options = { upload: upload, user: user, llm_model: llm_model }
  Reader.new(**reader_options)
end

# Stores the PDF upload to extract text from, plus the optional user and
# LLM model that are used when pages are decorated by an LLM.
def initialize(upload:, user: nil, llm_model: nil)
  @upload, @user, @llm_model = upload, user, llm_model
end

# Walks the pages of the uploaded PDF in order and yields their text to the
# caller's block, one chunk at a time.
#
# The PDF is read from the local store path when the upload is local and is
# fetched with Discourse.store.download_safe otherwise. Each page's text is
# passed through llm_decorate, and every chunk it produces is yielded.
#
# Raises Discourse::InvalidParameters when no PDF path could be obtained.
def extract_text
pdf_path =
if upload.local?
Discourse.store.path_for(upload)
else
# NOTE(review): MAX_PDF_SIZE is not defined in the visible part of this
# class - confirm the constant resolves from here.
# NOTE(review): the downloaded file is not removed afterwards - confirm
# whether cleanup happens elsewhere.
Discourse.store.download_safe(upload, max_file_size_kb: MAX_PDF_SIZE)&.path
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem like we are cleaning up this file after downloading it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we clean up anywhere though? this is a bit tricky, no clean pattern for this.

end

raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?

# loaded lazily, only when a PDF is actually processed
require "pdf/reader"

# llm_decorate expects 1-based page numbers, so count up before each call
page_number = 0
PDF::Reader.open(pdf_path) do |reader|
reader.pages.each do |page|
page_number += 1
llm_decorate(page_number: page_number, text: page.text, pdf_path: pdf_path) do |chunk|
yield chunk
end
end
end
end

# Yields the text for a single PDF page to the given block.
#
# Without an LLM model the text extracted from the PDF is yielded unchanged.
# With a model, the page is rendered to a PNG with ImageMagick, stored as an
# upload, and handed to ImageToText with the extracted text as guidance;
# every chunk ImageToText produces is yielded.
#
# page_number: 1-based number of the page to process
# text:        text already extracted from that page
# pdf_path:    filesystem path of the source PDF
#
# Raises RuntimeError when called without a block.
def llm_decorate(page_number:, text:, pdf_path:)
  raise "Must be called with block" if !block_given?

  if !@llm_model
    yield text
    return
  end

  begin
    temp_dir = Dir.mktmpdir("discourse-pdf-#{SecureRandom.hex(8)}")
    output_path = File.join(temp_dir, "page-#{page_number}.png")

    # Extract specific page using ImageMagick
    # image magick uses 0 based page numbers
    command = [
      "magick",
      "-density",
      "300",
      "#{pdf_path}[#{page_number - 1}]",
      "-background",
      "white",
      "-auto-orient",
      "-quality",
      "85",
      output_path,
    ]

    Discourse::Utils.execute_command(
      *command,
      failure_message: "Failed to convert PDF page #{page_number} to image",
      timeout: 30,
    )

    # TODO - we are creating leftover uploads, they will be cleaned up
    # but maybe we should just keep them around?
    #
    # Block form of File.open so the rendered image's file handle is closed
    # as soon as the upload has been created (one handle per page otherwise).
    page_upload =
      File.open(output_path) do |image_file|
        UploadCreator.new(image_file, "page-#{page_number}.png").create_for(@user&.id)
      end

    DiscourseAi::Utils::ImageToText
      .new(upload: page_upload, llm_model: @llm_model, user: @user, guidance_text: text)
      .extract_text { |chunk| yield chunk }
  ensure
    # always remove the scratch directory, even when conversion or OCR fails
    FileUtils.rm_rf(temp_dir) if temp_dir
  end
end
end
10 changes: 10 additions & 0 deletions plugin.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@
gem "tiktoken_ruby", "0.0.9"
gem "ed25519", "1.2.4" #TODO remove this as existing ssl gem should handle this

# we probably want to move all dependencies directly into the Discourse Gemfile; this
# will give us a strong guarantee that the dependencies are compatible and keep getting upgraded
gem "Ascii85", "2.0.1", require: false
gem "ruby-rc4", "0.1.5", require: false
gem "hashery", "2.1.2", require: false
gem "ttfunk", "1.8.0", require: false
gem "afm", "0.2.2", require: false
# all above are required by pdf-reader
gem "pdf-reader", "2.14.1", require: false

enabled_site_setting :discourse_ai_enabled

register_asset "stylesheets/common/streaming.scss"
Expand Down
Binary file added spec/fixtures/rag/2-page.pdf
Binary file not shown.
8 changes: 4 additions & 4 deletions spec/jobs/regular/digest_rag_upload_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
RSpec.describe Jobs::DigestRagUpload do
fab!(:persona) { Fabricate(:ai_persona) }
fab!(:upload) { Fabricate(:upload, extension: "txt") }
fab!(:pdf_upload) { Fabricate(:upload, extension: "pdf") }
fab!(:image_upload) { Fabricate(:upload, extension: "png") }
let(:document_file) { StringIO.new("some text" * 200) }

fab!(:cloudflare_embedding_def)
Expand Down Expand Up @@ -31,13 +31,13 @@
end

describe "#execute" do
context "when processing a PDF upload" do
context "when processing an image upload" do
it "will reject the indexing if the site setting is not enabled" do
SiteSetting.ai_rag_pdf_images_enabled = false
SiteSetting.ai_rag_images_enabled = false

expect {
described_class.new.execute(
upload_id: pdf_upload.id,
upload_id: image_upload.id,
target_id: persona.id,
target_type: persona.class.to_s,
)
Expand Down
Loading