Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit ce79a18

Browse files
SamSaffron and tgxworld authored
FEATURE: Native PDF support (#1127)
* FEATURE: Native PDF support
  This amends it so we use the PDF Reader gem to extract text from PDFs
* This means that our simple pdf eval passes at last
* fix spec
* skip test in CI
* test file support
* Update lib/utils/image_to_text.rb
  Co-authored-by: Alan Guo Xiang Tan <[email protected]>
* address pr comments
---------
Co-authored-by: Alan Guo Xiang Tan <[email protected]>
1 parent 9a6aec2 commit ce79a18

File tree

19 files changed

+248
-62
lines changed

19 files changed

+248
-62
lines changed

app/controllers/discourse_ai/admin/ai_personas_controller.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def index
4141
tools: tools,
4242
llms: llms,
4343
settings: {
44-
rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled,
44+
rag_images_enabled: SiteSetting.ai_rag_images_enabled,
4545
},
4646
},
4747
}

app/controllers/discourse_ai/admin/rag_document_fragments_controller.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ def upload_file
4848

4949
def validate_extension!(filename)
5050
extension = File.extname(filename)[1..-1] || ""
51-
authorized_extensions = %w[txt md]
52-
authorized_extensions.concat(%w[pdf png jpg jpeg]) if SiteSetting.ai_rag_pdf_images_enabled
51+
authorized_extensions = %w[txt md pdf]
52+
authorized_extensions.concat(%w[png jpg jpeg]) if SiteSetting.ai_rag_images_enabled
5353
if !authorized_extensions.include?(extension)
5454
raise Discourse::InvalidParameters.new(
5555
I18n.t(

app/jobs/regular/digest_rag_upload.rb

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -164,22 +164,16 @@ def first_chunk(text, chunk_tokens:, tokenizer:, splitters: ["\n\n", "\n", ".",
164164
end
165165

166166
def get_uploaded_file(upload:, target:)
167-
if %w[pdf png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_pdf_images_enabled
167+
if %w[png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_images_enabled
168168
raise Discourse::InvalidAccess.new(
169-
"The setting ai_rag_pdf_images_enabled is false, can not index images and pdfs.",
169+
"The setting ai_rag_images_enabled is false, can not index images",
170170
)
171171
end
172172
if upload.extension == "pdf"
173-
pages =
174-
DiscourseAi::Utils::PdfToImages.new(
175-
upload: upload,
176-
user: Discourse.system_user,
177-
).uploaded_pages
178-
179173
return(
180-
DiscourseAi::Utils::ImageToText.as_fake_file(
181-
uploads: pages,
182-
llm_model: target.rag_llm_model,
174+
DiscourseAi::Utils::PdfToText.as_fake_file(
175+
upload: upload,
176+
llm_model: SiteSetting.ai_rag_images_enabled ? target.rag_llm_model : nil,
183177
user: Discourse.system_user,
184178
)
185179
)

app/serializers/ai_custom_tool_list_serializer.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ def meta
1010
presets: AiTool.presets,
1111
llms: DiscourseAi::Configuration::LlmEnumerator.values_for_serialization,
1212
settings: {
13-
rag_pdf_images_enabled: SiteSetting.ai_rag_pdf_images_enabled,
13+
rag_images_enabled: SiteSetting.ai_rag_images_enabled,
1414
},
1515
}
1616
end

assets/javascripts/discourse/components/ai-persona-editor.gjs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -596,13 +596,13 @@ export default class PersonaEditor extends Component {
596596
@target={{this.editingModel}}
597597
@updateUploads={{this.updateUploads}}
598598
@onRemove={{this.removeUpload}}
599-
@allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}}
599+
@allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
600600
/>
601601
</div>
602602
<RagOptions
603603
@model={{this.editingModel}}
604604
@llms={{@personas.resultSetMeta.llms}}
605-
@allowPdfsAndImages={{@personas.resultSetMeta.settings.rag_pdf_images_enabled}}
605+
@allowImages={{@personas.resultSetMeta.settings.rag_images_enabled}}
606606
>
607607
<div class="control-group">
608608
<label>{{i18n

assets/javascripts/discourse/components/ai-tool-editor.gjs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,13 +245,13 @@ export default class AiToolEditor extends Component {
245245
@target={{this.editingModel}}
246246
@updateUploads={{this.updateUploads}}
247247
@onRemove={{this.removeUpload}}
248-
@allowPdfsAndImages={{@settings.rag_pdf_images_enabled}}
248+
@allowImages={{@settings.rag_images_enabled}}
249249
/>
250250
</div>
251251
<RagOptions
252252
@model={{this.editingModel}}
253253
@llms={{@llms}}
254-
@allowPdfsAndImages={{@settings.rag_pdf_images_enabled}}
254+
@allowImages={{@settings.rag_images_enabled}}
255255
/>
256256
{{/if}}
257257

assets/javascripts/discourse/components/rag-options.gjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ export default class RagOptions extends Component {
8181
}}
8282
/>
8383
</div>
84-
{{#if @allowPdfsAndImages}}
84+
{{#if @allowImages}}
8585
<div class="control-group">
8686
<label>{{i18n "discourse_ai.rag.options.rag_llm_model"}}</label>
8787
<AiLlmSelector

assets/javascripts/discourse/components/rag-uploader.gjs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,10 @@ export default class RagUploader extends Component {
7878
}
7979

8080
get acceptedFileTypes() {
81-
if (this.args?.allowPdfsAndImages) {
82-
return ".txt,.md,.pdf,.png,.jpg,.jpeg";
81+
if (this.args?.allowImages) {
82+
return ".txt,.md,.png,.jpg,.jpeg";
8383
} else {
84-
return ".txt,.md";
84+
return ".txt,.md,.pdf";
8585
}
8686
}
8787

@@ -127,8 +127,8 @@ export default class RagUploader extends Component {
127127
<template>
128128
<div class="rag-uploader">
129129
<h3>{{i18n "discourse_ai.rag.uploads.title"}}</h3>
130-
{{#if @allowPdfsAndImages}}
131-
<p>{{i18n "discourse_ai.rag.uploads.description_with_pdfs"}}</p>
130+
{{#if @allowImages}}
131+
<p>{{i18n "discourse_ai.rag.uploads.description_with_images"}}</p>
132132
{{else}}
133133
<p>{{i18n "discourse_ai.rag.uploads.description"}}</p>
134134
{{/if}}

config/locales/client.en.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -280,8 +280,8 @@ en:
280280
hide_indexing_options: "Hide upload options"
281281
uploads:
282282
title: "Uploads"
283-
description: "Plaintext (.txt) or markdown (.md)"
284-
description_with_pdfs: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)"
283+
description: "PDF (.pdf), Plaintext (.txt) or markdown (.md)"
284+
description_with_images: "Plaintext (.txt), markdown (.md), PDF (.pdf) or image (.png, .jpeg)"
285285
button: "Add files"
286286
filter: "Filter uploads"
287287
indexed: "Indexed"

config/settings.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,6 @@ discourse_ai:
355355
hidden: true
356356
type: list
357357

358-
ai_rag_pdf_images_enabled:
358+
ai_rag_images_enabled:
359359
default: false
360360
hidden: true

0 commit comments

Comments
 (0)