-
Notifications
You must be signed in to change notification settings - Fork 40
FEATURE: PDF support for rag pipeline #1118
Changes from 27 commits
2d9c72c
3c7dd74
b64511e
34b9521
2032f5f
d4695ec
4d231c3
0875406
18c6a80
ace9f94
4d1798c
fdd4a9b
2181e2a
72e9576
4ba0d5c
e2f71f1
938a445
848692c
2f4276a
5e9cb80
5736da6
db6e28a
f5ce2db
25c97ca
b0a549b
10ea742
bcb7cdf
8393dd7
3c054e2
bfa6a40
1de56d8
888237a
be97405
c0f181f
b3fcf3f
7064d4b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,3 +2,5 @@ node_modules | |
| /gems | ||
| /auto_generated | ||
| .env | ||
| evals/log | ||
| evals/cases | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -28,7 +28,7 @@ def execute(args) | |||||
|
|
||||||
| # Check if this is the first time we process this upload. | ||||||
| if fragment_ids.empty? | ||||||
| document = get_uploaded_file(upload) | ||||||
| document = get_uploaded_file(upload: upload, target: target) | ||||||
| return if document.nil? | ||||||
|
|
||||||
| RagDocumentFragment.publish_status(upload, { total: 0, indexed: 0, left: 0 }) | ||||||
|
|
@@ -163,7 +163,38 @@ def first_chunk(text, chunk_tokens:, tokenizer:, splitters: ["\n\n", "\n", ".", | |||||
| [buffer, split_char] | ||||||
| end | ||||||
|
|
||||||
| def get_uploaded_file(upload) | ||||||
| def get_uploaded_file(upload:, target:) | ||||||
| if %w[pdf png jpg jpeg].include?(upload.extension) && !SiteSetting.ai_rag_pdf_images_enabled | ||||||
| raise Discourse::InvalidAccess.new( | ||||||
| "The setting ai_rag_pdf_images_enabled is false, can not index images and pdfs.", | ||||||
| ) | ||||||
| end | ||||||
| if upload.extension == "pdf" | ||||||
| pages = | ||||||
| DiscourseAi::Utils::PdfToImages.new( | ||||||
| upload: upload, | ||||||
| user: Discourse.system_user, | ||||||
| ).uploaded_pages | ||||||
|
|
||||||
| return( | ||||||
| DiscourseAi::Utils::ImageToText.as_fake_file( | ||||||
| uploads: pages, | ||||||
| llm_model: target.rag_llm_model, | ||||||
| user: Discourse.system_user, | ||||||
| ) | ||||||
| ) | ||||||
| end | ||||||
|
|
||||||
| if %w[png jpg jpeg].include?(upload.extension) | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As part of my comment above, we could easily perform these extension checks if we create a helper somewhere.
Suggested change
|
||||||
| return( | ||||||
| DiscourseAi::Utils::ImageToText.as_fake_file( | ||||||
| uploads: [upload], | ||||||
| llm_model: target.rag_llm_model, | ||||||
| user: Discourse.system_user, | ||||||
| ) | ||||||
| ) | ||||||
| end | ||||||
|
|
||||||
| store = Discourse.store | ||||||
| @file ||= | ||||||
| if store.external? | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,8 @@ | ||
| # frozen_string_literal: true | ||
|
|
||
| class AiPersona < ActiveRecord::Base | ||
| # TODO remove this line 01-1-2025 | ||
| self.ignored_columns = %i[commands allow_chat mentionable] | ||
| # TODO remove this line 01-10-2025 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you mean
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, dates... this one is D-M-Y; I generally try to stick with that format for code comments. Dates are hard — for comments like this, maybe we should use a form like "October-2025", which is far less ambiguous. |
||
| self.ignored_columns = %i[default_llm question_consolidator_llm] | ||
|
|
||
| # places a hard limit, so per site we cache a maximum of 500 classes | ||
| MAX_PERSONAS_PER_SITE = 500 | ||
|
|
@@ -12,7 +12,7 @@ class AiPersona < ActiveRecord::Base | |
| validates :system_prompt, presence: true, length: { maximum: 10_000_000 } | ||
| validate :system_persona_unchangeable, on: :update, if: :system | ||
| validate :chat_preconditions | ||
| validate :allowed_seeded_model, if: :default_llm | ||
| validate :allowed_seeded_model, if: :default_llm_id | ||
| validates :max_context_posts, numericality: { greater_than: 0 }, allow_nil: true | ||
| # leaves some room for growth but sets a maximum to avoid memory issues | ||
| # we may want to revisit this in the future | ||
|
|
@@ -30,6 +30,10 @@ class AiPersona < ActiveRecord::Base | |
| belongs_to :created_by, class_name: "User" | ||
| belongs_to :user | ||
|
|
||
| belongs_to :default_llm, class_name: "LlmModel" | ||
| belongs_to :question_consolidator_llm, class_name: "LlmModel" | ||
| belongs_to :rag_llm_model, class_name: "LlmModel" | ||
|
|
||
| has_many :upload_references, as: :target, dependent: :destroy | ||
| has_many :uploads, through: :upload_references | ||
|
|
||
|
|
@@ -62,7 +66,7 @@ def self.persona_users(user: nil) | |
| user_id: persona.user_id, | ||
| username: persona.user.username_lower, | ||
| allowed_group_ids: persona.allowed_group_ids, | ||
| default_llm: persona.default_llm, | ||
| default_llm_id: persona.default_llm_id, | ||
| force_default_llm: persona.force_default_llm, | ||
| allow_chat_channel_mentions: persona.allow_chat_channel_mentions, | ||
| allow_chat_direct_messages: persona.allow_chat_direct_messages, | ||
|
|
@@ -157,12 +161,12 @@ def class_instance | |
| user_id | ||
| system | ||
| mentionable | ||
| default_llm | ||
| default_llm_id | ||
| max_context_posts | ||
| vision_enabled | ||
| vision_max_pixels | ||
| rag_conversation_chunks | ||
| question_consolidator_llm | ||
| question_consolidator_llm_id | ||
| allow_chat_channel_mentions | ||
| allow_chat_direct_messages | ||
| allow_topic_mentions | ||
|
|
@@ -302,7 +306,7 @@ def chat_preconditions | |
| if ( | ||
| allow_chat_channel_mentions || allow_chat_direct_messages || allow_topic_mentions || | ||
| force_default_llm | ||
| ) && !default_llm | ||
| ) && !default_llm_id | ||
| errors.add(:default_llm, I18n.t("discourse_ai.ai_bot.personas.default_llm_required")) | ||
| end | ||
| end | ||
|
|
@@ -332,13 +336,12 @@ def ensure_not_system | |
| end | ||
|
|
||
| def allowed_seeded_model | ||
| return if default_llm.blank? | ||
| return if default_llm_id.blank? | ||
|
|
||
| llm = LlmModel.find_by(id: default_llm.split(":").last.to_i) | ||
| return if llm.nil? | ||
| return if !llm.seeded? | ||
| return if default_llm.nil? | ||
| return if !default_llm.seeded? | ||
|
|
||
| return if SiteSetting.ai_bot_allowed_seeded_models.include?(llm.id.to_s) | ||
| return if SiteSetting.ai_bot_allowed_seeded_models_map.include?(default_llm.id.to_s) | ||
|
|
||
| errors.add(:default_llm, I18n.t("discourse_ai.llm.configuration.invalid_seeded_model")) | ||
| end | ||
|
|
@@ -348,36 +351,37 @@ def allowed_seeded_model | |
| # | ||
| # Table name: ai_personas | ||
| # | ||
| # id :bigint not null, primary key | ||
| # name :string(100) not null | ||
| # description :string(2000) not null | ||
| # system_prompt :string(10000000) not null | ||
| # allowed_group_ids :integer default([]), not null, is an Array | ||
| # created_by_id :integer | ||
| # enabled :boolean default(TRUE), not null | ||
| # created_at :datetime not null | ||
| # updated_at :datetime not null | ||
| # system :boolean default(FALSE), not null | ||
| # priority :boolean default(FALSE), not null | ||
| # temperature :float | ||
| # top_p :float | ||
| # user_id :integer | ||
| # default_llm :text | ||
| # max_context_posts :integer | ||
| # vision_enabled :boolean default(FALSE), not null | ||
| # vision_max_pixels :integer default(1048576), not null | ||
| # rag_chunk_tokens :integer default(374), not null | ||
| # rag_chunk_overlap_tokens :integer default(10), not null | ||
| # rag_conversation_chunks :integer default(10), not null | ||
| # question_consolidator_llm :text | ||
| # tool_details :boolean default(TRUE), not null | ||
| # tools :json not null | ||
| # forced_tool_count :integer default(-1), not null | ||
| # allow_chat_channel_mentions :boolean default(FALSE), not null | ||
| # allow_chat_direct_messages :boolean default(FALSE), not null | ||
| # allow_topic_mentions :boolean default(FALSE), not null | ||
| # allow_personal_messages :boolean default(TRUE), not null | ||
| # force_default_llm :boolean default(FALSE), not null | ||
| # id :bigint not null, primary key | ||
| # name :string(100) not null | ||
| # description :string(2000) not null | ||
| # system_prompt :string(10000000) not null | ||
| # allowed_group_ids :integer default([]), not null, is an Array | ||
| # created_by_id :integer | ||
| # enabled :boolean default(TRUE), not null | ||
| # created_at :datetime not null | ||
| # updated_at :datetime not null | ||
| # system :boolean default(FALSE), not null | ||
| # priority :boolean default(FALSE), not null | ||
| # temperature :float | ||
| # top_p :float | ||
| # user_id :integer | ||
| # max_context_posts :integer | ||
| # vision_enabled :boolean default(FALSE), not null | ||
| # vision_max_pixels :integer default(1048576), not null | ||
| # rag_chunk_tokens :integer default(374), not null | ||
| # rag_chunk_overlap_tokens :integer default(10), not null | ||
| # rag_conversation_chunks :integer default(10), not null | ||
| # tool_details :boolean default(TRUE), not null | ||
| # tools :json not null | ||
| # forced_tool_count :integer default(-1), not null | ||
| # allow_chat_channel_mentions :boolean default(FALSE), not null | ||
| # allow_chat_direct_messages :boolean default(FALSE), not null | ||
| # allow_topic_mentions :boolean default(FALSE), not null | ||
| # allow_personal_messages :boolean default(TRUE), not null | ||
| # force_default_llm :boolean default(FALSE), not null | ||
| # rag_llm_model_id :bigint | ||
| # default_llm_id :bigint | ||
| # question_consolidator_llm_id :bigint | ||
| # | ||
| # Indexes | ||
| # | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.