Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit a562a5e

Browse files
committed
Generate search bit index on the fly. cleanup orphaned data
1 parent 8bac20a commit a562a5e

File tree

10 files changed

+258
-15
lines changed

10 files changed

+258
-15
lines changed

app/controllers/discourse_ai/admin/ai_embeddings_controller.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def create
4949
def update
5050
embedding_def = EmbeddingDefinition.find(params[:id])
5151

52-
if embedding_def.update(ai_embeddings_params)
52+
if embedding_def.update(ai_embeddings_params.except(:dimensions))
5353
render json: AiEmbeddingDefinitionSerializer.new(embedding_def)
5454
else
5555
render_json_error embedding_def
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# frozen_string_literal: true
2+
3+
module ::Jobs
  # Background job that makes sure the search indexes for one embedding
  # definition exist. It is a no-op when the definition is gone or when the
  # schema already reports the definition as correctly indexed.
  class ManageEmbeddingDefSearchIndex < ::Jobs::Base
    # @param args [Hash] job arguments; `args[:id]` is the EmbeddingDefinition id
    def execute(args)
      embedding_def = EmbeddingDefinition.find_by(id: args[:id])
      return unless embedding_def

      schema = DiscourseAi::Embeddings::Schema
      schema.prepare_search_indexes(embedding_def) unless schema.correctly_indexed?(embedding_def)
    end
  end
end
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# frozen_string_literal: true
2+
3+
module Jobs
  # Scheduled housekeeping job: once a week it asks the embeddings schema to
  # remove data left behind by embedding definitions that no longer exist.
  class RemoveOrphanedEmbeddings < ::Jobs::Scheduled
    every 1.week

    # @param _args [Hash] unused; present to satisfy the job interface
    def execute(_args)
      DiscourseAi::Embeddings::Schema.remove_orphaned_data
    end
  end
end

app/models/embedding_definition.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@ def presets
121121
validates :tokenizer_class, presence: true, inclusion: tokenizer_names
122122
validates_presence_of :url, :api_key, :dimensions, :max_sequence_length, :pg_function
123123

124+
after_create :create_indexes
125+
126+
# Callback target (registered with `after_create`): enqueues the
# :manage_embedding_def_search_index job for this record's id.
def create_indexes
  Jobs.enqueue(:manage_embedding_def_search_index, id: id)
end
129+
124130
# Resolves the stored `tokenizer_class` name into the constant it names.
def tokenizer
  tokenizer_class.constantize
end

assets/javascripts/discourse/components/ai-embedding-editor.gjs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ import { later } from "@ember/runloop";
1010
import { service } from "@ember/service";
1111
import BackButton from "discourse/components/back-button";
1212
import DButton from "discourse/components/d-button";
13+
import icon from "discourse/helpers/d-icon";
1314
import { popupAjaxError } from "discourse/lib/ajax-error";
14-
import icon from "discourse-common/helpers/d-icon";
1515
import { i18n } from "discourse-i18n";
1616
import ComboBox from "select-kit/components/combo-box";
17+
import DTooltip from "float-kit/components/d-tooltip";
18+
import not from "truth-helpers/helpers/not";
1719

1820
export default class AiEmbeddingEditor extends Component {
1921
@service toasts;
@@ -276,7 +278,16 @@ export default class AiEmbeddingEditor extends Component {
276278
lang="en"
277279
@value={{this.editingModel.dimensions}}
278280
required="true"
281+
disabled={{not this.editingModel.isNew}}
279282
/>
283+
{{#if this.editingModel.isNew}}
284+
<DTooltip
285+
@icon="circle-exclamation"
286+
@content={{i18n
287+
"discourse_ai.embeddings.hints.dimensions_warning"
288+
}}
289+
/>
290+
{{/if}}
280291
</div>
281292

282293
<div class="control-group">

config/locales/client.en.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,8 @@ en:
519519
running: "Running test..."
520520
success: "Success!"
521521
failure: "Attempting to generate an embedding resulted in: %{error}"
522+
hints:
523+
dimensions_warning: "Once saved, this value can't be changed."
522524

523525
display_name: "Name"
524526
provider: "Provider"

lib/embeddings/schema.rb

Lines changed: 72 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,80 @@ class Schema
1212
POSTS_TABLE = "ai_posts_embeddings"
1313
RAG_DOCS_TABLE = "ai_document_fragments_embeddings"
1414

15+
# Target names interpolated into per-target table and index names
# ("ai_<target>_embeddings..."). Frozen so the shared list cannot be mutated
# by accident at runtime.
EMBEDDING_TARGETS = %w[topics posts document_fragments].freeze
# Every embeddings table, in the same order as EMBEDDING_TARGETS.
EMBEDDING_TABLES = [TOPICS_TABLE, POSTS_TABLE, RAG_DOCS_TABLE].freeze
17+
1518
MissingEmbeddingError = Class.new(StandardError)
1619

17-
def self.for(target_klass)
18-
vector_def = EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_selected_model)
19-
raise "Invalid embeddings selected model" if vector_def.nil?
20-
21-
case target_klass&.name
22-
when "Topic"
23-
new(TOPICS_TABLE, "topic_id", vector_def)
24-
when "Post"
25-
new(POSTS_TABLE, "post_id", vector_def)
26-
when "RagDocumentFragment"
27-
new(RAG_DOCS_TABLE, "rag_document_fragment_id", vector_def)
28-
else
29-
raise ArgumentError, "Invalid target type for embeddings"
20+
class << self
21+
# Builds a schema bound to the embeddings table of the given target class,
# using the embedding definition currently selected in site settings.
#
# @param target_klass [Class, nil] Topic, Post or RagDocumentFragment
# @raise [RuntimeError] when the selected embedding definition does not exist
# @raise [ArgumentError] when the class is not a supported embeddings target
def for(target_klass)
  vector_def = EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_selected_model)
  raise "Invalid embeddings selected model" unless vector_def

  # Class name => [embeddings table, foreign-key column].
  table, target_column = {
    "Topic" => [TOPICS_TABLE, "topic_id"],
    "Post" => [POSTS_TABLE, "post_id"],
    "RagDocumentFragment" => [RAG_DOCS_TABLE, "rag_document_fragment_id"],
  }[
    target_klass&.name
  ]

  raise ArgumentError, "Invalid target type for embeddings" if table.nil?

  new(table, target_column, vector_def)
end
37+
# Name of the search index for one target and embedding definition.
#
# @param table [#to_s] target name, e.g. "topics"
# @param def_id [#to_s] embedding definition id
# @return [String] e.g. "ai_topics_embeddings_3_1_search_bit"
def search_index_name(table, def_id)
  format("ai_%s_embeddings_%s_1_search_bit", table, def_id)
end
40+
41+
# Creates, for every embedding target, the partial HNSW index over the
# binary-quantized embeddings of the given definition. Uses
# CREATE INDEX IF NOT EXISTS, so an index that already exists under the
# expected name is left as is.
#
# @param vector_def [EmbeddingDefinition] supplies the id and dimensions
def prepare_search_indexes(vector_def)
  EMBEDDING_TARGETS.each do |target|
    index_name = search_index_name(target, vector_def.id)

    DB.exec(<<~SQL)
      CREATE INDEX IF NOT EXISTS #{index_name} ON ai_#{target}_embeddings
      USING hnsw ((binary_quantize(embeddings)::bit(#{vector_def.dimensions})) bit_hamming_ops)
      WHERE model_id = #{vector_def.id} AND strategy_id = 1;
    SQL
  end
end
48+
49+
# Tells whether every expected search index exists for the definition and
# was built for the definition's current number of dimensions.
#
# @param vector_def [EmbeddingDefinition]
# @return [Boolean]
def correctly_indexed?(vector_def)
  expected_names = EMBEDDING_TARGETS.map { |target| search_index_name(target, vector_def.id) }

  found_defs =
    DB.query_single(
      "SELECT indexdef FROM pg_indexes WHERE indexname IN (:names)",
      names: expected_names,
    )

  # At least one index is missing.
  return false unless found_defs.size >= expected_names.size

  # The index definition text must contain the cast to the current bit width.
  expected_cast = "(binary_quantize(embeddings))::bit(#{vector_def.dimensions})"
  found_defs.all? { |indexdef| indexdef.include?(expected_cast) }
end
63+
64+
# Deletes embeddings whose model_id no longer matches any row in
# embedding_definitions, then drops the search indexes named after those ids.
#
# Orphaned ids are collected from every embeddings table, not only the
# topics one, so a removed definition that only ever produced post or
# document-fragment embeddings is cleaned up as well.
#
# @return [void]
def remove_orphaned_data
  orphan_lookup =
    EMBEDDING_TABLES
      .map do |table|
        "SELECT e.model_id FROM #{table} e LEFT JOIN embedding_definitions ed ON e.model_id = ed.id WHERE ed.id IS NULL"
      end
      .join(" UNION ") # UNION (not UNION ALL) de-duplicates ids across tables

  removed_defs_ids = DB.query_single(orphan_lookup)

  # Nothing orphaned: skip the DELETEs (empty IN list) and the empty DROP batch.
  return if removed_defs_ids.empty?

  EMBEDDING_TABLES.each do |table|
    DB.exec(
      "DELETE FROM #{table} WHERE model_id IN (:removed_defs)",
      removed_defs: removed_defs_ids,
    )
  end

  drop_index_statement =
    EMBEDDING_TARGETS
      .product(removed_defs_ids)
      .map { |target, def_id| "DROP INDEX IF EXISTS #{search_index_name(target, def_id)};" }
      .join("\n")

  DB.exec(drop_index_statement)
end
3190
end
3291

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# frozen_string_literal: true
2+
3+
RSpec.describe Jobs::ManageEmbeddingDefSearchIndex do
  fab!(:embedding_definition)

  # Shorthand for the schema-level check the examples assert on.
  def indexed?(definition)
    DiscourseAi::Embeddings::Schema.correctly_indexed?(definition)
  end

  describe "#execute" do
    context "when there is no embedding def" do
      it "does nothing" do
        missing_id = 999_999_999

        subject.execute(id: missing_id)

        expect(indexed?(EmbeddingDefinition.new(id: missing_id))).to eq(false)
      end
    end

    context "when the embedding def is fresh" do
      it "creates the indexes" do
        subject.execute(id: embedding_definition.id)

        expect(indexed?(embedding_definition)).to eq(true)
      end

      it "creates them only once" do
        2.times { subject.execute(id: embedding_definition.id) }

        expect(indexed?(embedding_definition)).to eq(true)
      end

      context "when one of the idxs is missing" do
        it "automatically recovers by creating it" do
          # Create only the topics index by hand so the other targets stay unindexed.
          DB.exec <<~SQL
            CREATE INDEX IF NOT EXISTS ai_topics_embeddings_#{embedding_definition.id}_1_search_bit ON ai_topics_embeddings
            USING hnsw ((binary_quantize(embeddings)::bit(#{embedding_definition.dimensions})) bit_hamming_ops)
            WHERE model_id = #{embedding_definition.id} AND strategy_id = 1;
          SQL

          expect(indexed?(embedding_definition)).to eq(false)

          subject.execute(id: embedding_definition.id)

          expect(indexed?(embedding_definition)).to eq(true)
        end
      end
    end
  end
end
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# frozen_string_literal: true
2+
3+
RSpec.describe Jobs::RemoveOrphanedEmbeddings do
  describe "#execute" do
    fab!(:embedding_definition)
    fab!(:embedding_definition_2) { Fabricate(:embedding_definition) }
    fab!(:topic)
    fab!(:post)

    before do
      definitions = [embedding_definition, embedding_definition_2]

      definitions.each { |edef| DiscourseAi::Embeddings::Schema.prepare_search_indexes(edef) }

      # Seed embeddings. One of each def x target classes.
      definitions.each do |edef|
        SiteSetting.ai_embeddings_selected_model = edef.id

        [topic, post].each do |target|
          DiscourseAi::Embeddings::Schema.for(target.class).store(
            target,
            [1] * edef.dimensions,
            "test",
          )
        end
      end

      # Destroying the first definition leaves its embeddings and indexes orphaned.
      embedding_definition.destroy!
    end

    # Returns the model ids of every embedding stored for the given record.
    def find_all_embeddings_of(target, table, target_column)
      DB.query_single("SELECT model_id FROM #{table} WHERE #{target_column} = #{target.id}")
    end

    it "delete embeddings without an existing embedding definition" do
      all_ids = [embedding_definition.id, embedding_definition_2.id]
      surviving_ids = [embedding_definition_2.id]

      expect(find_all_embeddings_of(post, "ai_posts_embeddings", "post_id")).to contain_exactly(
        *all_ids,
      )
      expect(find_all_embeddings_of(topic, "ai_topics_embeddings", "topic_id")).to contain_exactly(
        *all_ids,
      )

      subject.execute({})

      expect(find_all_embeddings_of(topic, "ai_topics_embeddings", "topic_id")).to contain_exactly(
        *surviving_ids,
      )
      expect(find_all_embeddings_of(post, "ai_posts_embeddings", "post_id")).to contain_exactly(
        *surviving_ids,
      )
    end

    it "deletes orphaned indexes" do
      schema = DiscourseAi::Embeddings::Schema

      expect(schema.correctly_indexed?(embedding_definition)).to eq(true)
      expect(schema.correctly_indexed?(embedding_definition_2)).to eq(true)

      subject.execute({})

      orphaned_index_names =
        schema::EMBEDDING_TARGETS.map do |t|
          "ai_#{t}_embeddings_#{embedding_definition.id}_1_search_bit"
        end
      remaining_indexes =
        DB.query_single(
          "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
          names: orphaned_index_names,
        )

      expect(remaining_indexes).to be_empty
      expect(schema.correctly_indexed?(embedding_definition_2)).to eq(true)
    end
  end
end

spec/requests/admin/ai_embeddings_controller_spec.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,20 @@
9595

9696
expect(response.status).to eq(404)
9797
end
98+
99+
# The update endpoint drops the `dimensions` param instead of rejecting the
# request, so the call succeeds and the stored value stays untouched.
it "doesn't allow dimensions to be updated" do
  original_dimensions = embedding_definition.dimensions
  # Derived from the current value so the check can never pass by coincidence.
  new_dimensions = original_dimensions + 1

  put "/admin/plugins/discourse-ai/ai-embeddings/#{embedding_definition.id}.json",
      params: {
        ai_embedding: {
          dimensions: new_dimensions,
        },
      }

  expect(response.status).to eq(200)
  expect(embedding_definition.reload.dimensions).to eq(original_dimensions)
end
98112
end
99113

100114
context "with invalid update params" do

0 commit comments

Comments
 (0)