Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions lib/ai_helper/semantic_categorizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@ class SemanticCategorizer
# Stores the requesting user and the text to classify, and resolves the
# embeddings vector helper and the Topic embeddings schema once so the
# rest of the instance can reuse them.
#
# @param input [Hash] must respond to `[]`; only the `:text` entry is read here
# @param user [Object, nil] kept as-is in `@user`
def initialize(input, user)
  @user, @text = user, input[:text]

  vector_helper = DiscourseAi::Embeddings::Vector.instance
  topic_schema = DiscourseAi::Embeddings::Schema.for(Topic)

  @vector = vector_helper
  @schema = topic_schema
end

def categories
return [] if @text.blank?
return [] if !DiscourseAi::Embeddings.enabled?

candidates = nearest_neighbors(limit: 100)
candidates = nearest_neighbors
return [] if candidates.empty?

candidate_ids = candidates.map(&:first)
Expand Down Expand Up @@ -40,6 +42,9 @@ def categories
}
end
.map do |c|
  # Note: <#> returns the negative inner product since Postgres only supports ASC order index scans on operators.
  # This guard must be a comparison (`==`). An assignment (`=`) here is always truthy, so the
  # adjustment would be applied for every distance operator, and it would also call the
  # `pg_function=` writer on the vector definition.
  c[:score] = (c[:score] + 1).abs if @vector.vdef.pg_function == "<#>"

  c[:score] = 1 / (c[:score] + 1) # inverse of the distance
  c
end
Expand Down Expand Up @@ -72,6 +77,9 @@ def tags
.with_index { |tag_list, index| { tags: tag_list, score: candidates[index].last } }
.flat_map { |c| c[:tags].map { |t| { name: t, score: c[:score] } } }
.map do |c|
  # Note: <#> returns the negative inner product since Postgres only supports ASC order index scans on operators.
  # This guard must be a comparison (`==`). An assignment (`=`) here is always truthy, so the
  # adjustment would be applied for every distance operator, and it would also call the
  # `pg_function=` writer on the vector definition.
  c[:score] = (c[:score] + 1).abs if @vector.vdef.pg_function == "<#>"

  c[:score] = 1 / (c[:score] + 1) # inverse of the distance
  c
end
Expand All @@ -91,11 +99,8 @@ def tags

private

def nearest_neighbors(limit: 100)
vector = DiscourseAi::Embeddings::Vector.instance
schema = DiscourseAi::Embeddings::Schema.for(Topic)

raw_vector = vector.vector_from(@text)
def nearest_neighbors(limit: 50)
raw_vector = @vector.vector_from(@text)

muted_category_ids = nil
if @user.present?
Expand All @@ -106,7 +111,7 @@ def nearest_neighbors(limit: 100)
).pluck(:category_id)
end

schema
@schema
.asymmetric_similarity_search(raw_vector, limit: limit, offset: 0) do |builder|
builder.join("topics t on t.id = topic_id")
unless muted_category_ids.empty?
Expand Down
40 changes: 30 additions & 10 deletions lib/embeddings/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ class Schema
EMBEDDING_TARGETS = %w[topics posts document_fragments]
EMBEDDING_TABLES = [TOPICS_TABLE, POSTS_TABLE, RAG_DOCS_TABLE]

# Baseline value for the `hnsw.ef_search` setting. hnsw_search_workaround only
# emits a `SET LOCAL hnsw.ef_search` statement when twice the requested limit
# reaches this value.
# NOTE(review): presumably mirrors pgvector's built-in default of 40 — confirm against the pgvector version in use.
DEFAULT_HNSW_EF_SEARCH = 40

MissingEmbeddingError = Class.new(StandardError)

class << self
Expand Down Expand Up @@ -132,6 +134,8 @@ def find_by_target(target)
end

def asymmetric_similarity_search(embedding, limit:, offset:)
before_query = hnsw_search_workaround(limit)

builder = DB.build(<<~SQL)
WITH candidates AS (
SELECT
Expand All @@ -153,7 +157,7 @@ def asymmetric_similarity_search(embedding, limit:, offset:)
ORDER BY
embeddings::halfvec(#{dimensions}) #{pg_function} '[:query_embedding]'::halfvec(#{dimensions})
LIMIT :limit
OFFSET :offset
OFFSET :offset;
SQL

builder.where(
Expand All @@ -171,18 +175,24 @@ def asymmetric_similarity_search(embedding, limit:, offset:)
candidates_limit = limit * 2
end

builder.query(
query_embedding: embedding,
candidates_limit: candidates_limit,
limit: limit,
offset: offset,
)
ActiveRecord::Base.transaction do
DB.exec(before_query) if before_query.present?
builder.query(
query_embedding: embedding,
candidates_limit: candidates_limit,
limit: limit,
offset: offset,
)
end
rescue PG::Error => e
Rails.logger.error("Error #{e} querying embeddings for model #{vector_def.display_name}")
raise MissingEmbeddingError
end

def symmetric_similarity_search(record)
limit = 200
before_query = hnsw_search_workaround(limit)

builder = DB.build(<<~SQL)
WITH le_target AS (
SELECT
Expand Down Expand Up @@ -210,7 +220,7 @@ def symmetric_similarity_search(record)
le_target
LIMIT 1
)
LIMIT 200
LIMIT #{limit}
) AS widenet
ORDER BY
embeddings::halfvec(#{dimensions}) #{pg_function} (
Expand All @@ -220,14 +230,17 @@ def symmetric_similarity_search(record)
le_target
LIMIT 1
)
LIMIT 100;
LIMIT #{limit / 2};
SQL

builder.where("model_id = :vid AND strategy_id = :vsid")

yield(builder) if block_given?

builder.query(vid: vector_def.id, vsid: vector_def.strategy_id, target_id: record.id)
ActiveRecord::Base.transaction do
DB.exec(before_query) if before_query.present?
builder.query(vid: vector_def.id, vsid: vector_def.strategy_id, target_id: record.id)
end
rescue PG::Error => e
Rails.logger.error("Error #{e} querying embeddings for model #{vector_def.display_name}")
raise MissingEmbeddingError
Expand Down Expand Up @@ -259,6 +272,13 @@ def store(record, embedding, digest)

private

# Builds the SQL statement to run before a similarity search so that
# `hnsw.ef_search` is set to twice the requested limit.
#
# @param limit [Integer] number of rows the search will ask for
# @return [String] a `SET LOCAL hnsw.ef_search = ...;` statement, or an empty
#   string when twice the limit is below DEFAULT_HNSW_EF_SEARCH
def hnsw_search_workaround(limit)
  ef_search = limit * 2

  if ef_search < DEFAULT_HNSW_EF_SEARCH
    ""
  else
    "SET LOCAL hnsw.ef_search = #{ef_search};"
  end
end

delegate :dimensions, :pg_function, to: :vector_def
end
end
Expand Down
11 changes: 9 additions & 2 deletions spec/system/ai_helper/ai_composer_helper_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
RSpec.describe "AI Composer helper", type: :system, js: true do
fab!(:user) { Fabricate(:admin, refresh_auto_groups: true) }
fab!(:non_member_group) { Fabricate(:group) }
fab!(:embedding_definition)

before do
Group.find_by(id: Group::AUTO_GROUPS[:admins]).add(user)
Expand Down Expand Up @@ -243,7 +244,10 @@ def trigger_composer_helper(content)
end

context "when suggesting the category with AI category suggester" do
before { SiteSetting.ai_embeddings_enabled = true }
before do
SiteSetting.ai_embeddings_selected_model = embedding_definition.id
SiteSetting.ai_embeddings_enabled = true
end

it "updates the category with the suggested category" do
response =
Expand Down Expand Up @@ -274,7 +278,10 @@ def trigger_composer_helper(content)
end

context "when suggesting the tags with AI tag suggester" do
before { SiteSetting.ai_embeddings_enabled = true }
before do
SiteSetting.ai_embeddings_selected_model = embedding_definition.id
SiteSetting.ai_embeddings_enabled = true
end

it "updates the tag with the suggested tag" do
response =
Expand Down
11 changes: 9 additions & 2 deletions spec/system/ai_helper/ai_split_topic_suggestion_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
fab!(:cloud) { Fabricate(:tag) }
fab!(:feedback) { Fabricate(:tag) }
fab!(:review) { Fabricate(:tag) }
fab!(:embedding_definition)

before do
Group.find_by(id: Group::AUTO_GROUPS[:admins]).add(user)
Expand Down Expand Up @@ -80,7 +81,10 @@ def open_move_topic_modal
end

context "when suggesting categories with AI category suggester" do
before { SiteSetting.ai_embeddings_enabled = true }
before do
SiteSetting.ai_embeddings_selected_model = embedding_definition.id
SiteSetting.ai_embeddings_enabled = true
end

skip "TODO: Category suggester only loading one category in test" do
it "updates the category with the suggested category" do
Expand Down Expand Up @@ -108,7 +112,10 @@ def open_move_topic_modal
end

context "when suggesting tags with AI tag suggester" do
before { SiteSetting.ai_embeddings_enabled = true }
before do
SiteSetting.ai_embeddings_selected_model = embedding_definition.id
SiteSetting.ai_embeddings_enabled = true
end

it "update the tag with the suggested tag" do
response =
Expand Down