Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 918f6c4

Browse files
committed
FEATURE: Seamless embedding model upgrades
1 parent ab5edae commit 918f6c4

File tree

5 files changed

+143
-108
lines changed

5 files changed

+143
-108
lines changed

app/jobs/scheduled/embeddings_backfill.rb

Lines changed: 107 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -18,105 +18,116 @@ def execute(args)
1818
)
1919
end
2020

21-
rebaked = 0
22-
23-
vector = DiscourseAi::Embeddings::Vector.instance
24-
vector_def = vector.vdef
25-
table_name = DiscourseAi::Embeddings::Schema::TOPICS_TABLE
26-
27-
topics =
28-
Topic
29-
.joins(
30-
"LEFT JOIN #{table_name} ON #{table_name}.topic_id = topics.id AND #{table_name}.model_id = #{vector_def.id}",
21+
production_vector = DiscourseAi::Embeddings::Vector.instance
22+
production_vector_def = production_vector.vdef
23+
24+
if SiteSetting.ai_embeddings_backfill_model.present? &&
25+
SiteSetting.ai_embeddings_backfill_model != SiteSetting.ai_embeddings_selected_model
26+
backfill_vector =
27+
DiscourseAi::Embeddings::Vector.new(
28+
EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_backfill_model),
3129
)
32-
.where(archetype: Archetype.default)
33-
.where(deleted_at: nil)
34-
.order("topics.bumped_at DESC")
35-
36-
rebaked += populate_topic_embeddings(vector, topics.limit(limit - rebaked))
37-
38-
return if rebaked >= limit
39-
40-
# Then, we'll try to backfill embeddings for topics that have outdated
41-
# embeddings, be it model or strategy version
42-
relation = topics.where(<<~SQL).limit(limit - rebaked)
43-
#{table_name}.model_version < #{vector_def.version}
44-
OR
45-
#{table_name}.strategy_version < #{vector_def.strategy_version}
46-
SQL
47-
48-
rebaked += populate_topic_embeddings(vector, relation, force: true)
49-
50-
return if rebaked >= limit
30+
backfill_vector_def = backfill_vector.vdef
31+
end
5132

52-
# Finally, we'll try to backfill embeddings for topics that have outdated
53-
# embeddings due to edits or new replies. Here we only do 10% of the limit
54-
relation =
55-
topics
56-
.where("#{table_name}.updated_at < ?", 6.hours.ago)
57-
.where("#{table_name}.updated_at < topics.updated_at")
33+
topic_work_list = []
34+
topic_work_list << [production_vector, production_vector_def]
35+
topic_work_list << [backfill_vector, backfill_vector_def] if backfill_vector
36+
37+
topic_work_list.each do |vector, vector_def|
38+
rebaked = 0
39+
table_name = DiscourseAi::Embeddings::Schema::TOPICS_TABLE
40+
41+
topics =
42+
Topic
43+
.joins(
44+
"LEFT JOIN #{table_name} ON #{table_name}.topic_id = topics.id AND #{table_name}.model_id = #{vector_def.id}",
45+
)
46+
.where(archetype: Archetype.default)
47+
.where(deleted_at: nil)
48+
.order("topics.bumped_at DESC")
49+
50+
rebaked += populate_topic_embeddings(vector, topics.limit(limit - rebaked))
51+
52+
next if rebaked >= limit
53+
54+
# Then, we'll try to backfill embeddings for topics that have outdated
55+
# embeddings, be it model or strategy version
56+
relation = topics.where(<<~SQL).limit(limit - rebaked)
57+
#{table_name}.model_version < #{vector_def.version}
58+
OR
59+
#{table_name}.strategy_version < #{vector_def.strategy_version}
60+
SQL
61+
62+
rebaked += populate_topic_embeddings(vector, relation, force: true)
63+
64+
next if rebaked >= limit
65+
66+
# Finally, we'll try to backfill embeddings for topics that have outdated
67+
# embeddings due to edits or new replies. Here we only do 10% of the limit
68+
relation =
69+
topics
70+
.where("#{table_name}.updated_at < ?", 6.hours.ago)
71+
.where("#{table_name}.updated_at < topics.updated_at")
72+
.limit((limit - rebaked) / 10)
73+
74+
populate_topic_embeddings(vector, relation, force: true)
75+
76+
next unless SiteSetting.ai_embeddings_per_post_enabled
77+
78+
# Now for posts
79+
table_name = DiscourseAi::Embeddings::Schema::POSTS_TABLE
80+
posts_batch_size = 1000
81+
82+
posts =
83+
Post
84+
.joins(
85+
"LEFT JOIN #{table_name} ON #{table_name}.post_id = posts.id AND #{table_name}.model_id = #{vector_def.id}",
86+
)
87+
.where(deleted_at: nil)
88+
.where(post_type: Post.types[:regular])
89+
90+
# First, we'll try to backfill embeddings for posts that have none
91+
posts
92+
.where("#{table_name}.post_id IS NULL")
93+
.limit(limit - rebaked)
94+
.pluck(:id)
95+
.each_slice(posts_batch_size) do |batch|
96+
vector.gen_bulk_reprensentations(Post.where(id: batch))
97+
rebaked += batch.length
98+
end
99+
100+
next if rebaked >= limit
101+
102+
# Then, we'll try to backfill embeddings for posts that have outdated
103+
# embeddings, be it model or strategy version
104+
posts
105+
.where(<<~SQL)
106+
#{table_name}.model_version < #{vector_def.version}
107+
OR
108+
#{table_name}.strategy_version < #{vector_def.strategy_version}
109+
SQL
110+
.limit(limit - rebaked)
111+
.pluck(:id)
112+
.each_slice(posts_batch_size) do |batch|
113+
vector.gen_bulk_reprensentations(Post.where(id: batch))
114+
rebaked += batch.length
115+
end
116+
117+
next if rebaked >= limit
118+
119+
# Finally, we'll try to backfill embeddings for posts that have outdated
120+
# embeddings due to edits. Here we only do 10% of the limit
121+
posts
122+
.where("#{table_name}.updated_at < ?", 7.days.ago)
123+
.order("random()")
58124
.limit((limit - rebaked) / 10)
59-
60-
populate_topic_embeddings(vector, relation, force: true)
61-
62-
return if rebaked >= limit
63-
64-
return unless SiteSetting.ai_embeddings_per_post_enabled
65-
66-
# Now for posts
67-
table_name = DiscourseAi::Embeddings::Schema::POSTS_TABLE
68-
posts_batch_size = 1000
69-
70-
posts =
71-
Post
72-
.joins(
73-
"LEFT JOIN #{table_name} ON #{table_name}.post_id = posts.id AND #{table_name}.model_id = #{vector_def.id}",
74-
)
75-
.where(deleted_at: nil)
76-
.where(post_type: Post.types[:regular])
77-
78-
# First, we'll try to backfill embeddings for posts that have none
79-
posts
80-
.where("#{table_name}.post_id IS NULL")
81-
.limit(limit - rebaked)
82-
.pluck(:id)
83-
.each_slice(posts_batch_size) do |batch|
84-
vector.gen_bulk_reprensentations(Post.where(id: batch))
85-
rebaked += batch.length
86-
end
87-
88-
return if rebaked >= limit
89-
90-
# Then, we'll try to backfill embeddings for posts that have outdated
91-
# embeddings, be it model or strategy version
92-
posts
93-
.where(<<~SQL)
94-
#{table_name}.model_version < #{vector_def.version}
95-
OR
96-
#{table_name}.strategy_version < #{vector_def.strategy_version}
97-
SQL
98-
.limit(limit - rebaked)
99-
.pluck(:id)
100-
.each_slice(posts_batch_size) do |batch|
101-
vector.gen_bulk_reprensentations(Post.where(id: batch))
102-
rebaked += batch.length
103-
end
104-
105-
return if rebaked >= limit
106-
107-
# Finally, we'll try to backfill embeddings for posts that have outdated
108-
# embeddings due to edits. Here we only do 10% of the limit
109-
posts
110-
.where("#{table_name}.updated_at < ?", 7.days.ago)
111-
.order("random()")
112-
.limit((limit - rebaked) / 10)
113-
.pluck(:id)
114-
.each_slice(posts_batch_size) do |batch|
115-
vector.gen_bulk_reprensentations(Post.where(id: batch))
116-
rebaked += batch.length
117-
end
118-
119-
rebaked
125+
.pluck(:id)
126+
.each_slice(posts_batch_size) do |batch|
127+
vector.gen_bulk_reprensentations(Post.where(id: batch))
128+
rebaked += batch.length
129+
end
130+
end
120131
end
121132

122133
private

config/settings.yml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,20 +230,26 @@ discourse_ai:
230230
enum: "DiscourseAi::Configuration::EmbeddingDefsEnumerator"
231231
validator: "DiscourseAi::Configuration::EmbeddingDefsValidator"
232232
area: "ai-features/embeddings"
233+
# Hidden setting holding the id of a second embedding definition. When it is
# set and differs from ai_embeddings_selected_model, the scheduled embeddings
# backfill job builds an extra vector for it and backfills that model as well.
# The blank default means no extra model is backfilled.
ai_embeddings_backfill_model:
234+
type: enum
235+
default: ""
236+
allow_any: false
237+
enum: "DiscourseAi::Configuration::EmbeddingDefsEnumerator"
238+
hidden: true
233239
ai_embeddings_per_post_enabled:
234240
default: false
235241
hidden: true
236-
ai_embeddings_generate_for_pms:
242+
ai_embeddings_generate_for_pms:
237243
default: false
238244
area: "ai-features/embeddings"
239245
ai_embeddings_semantic_related_topics_enabled:
240246
default: false
241247
client: true
242248
area: "ai-features/embeddings"
243-
ai_embeddings_semantic_related_topics:
249+
ai_embeddings_semantic_related_topics:
244250
default: 5
245251
area: "ai-features/embeddings"
246-
ai_embeddings_semantic_related_include_closed_topics:
252+
ai_embeddings_semantic_related_include_closed_topics:
247253
default: true
248254
area: "ai-features/embeddings"
249255
ai_embeddings_backfill_batch_size:

lib/embeddings/schema.rb

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,11 @@ class Schema
2020
MissingEmbeddingError = Class.new(StandardError)
2121

2222
class << self
23-
def for(target_klass)
24-
vector_def = EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_selected_model)
23+
def for(target_klass, vector_def: nil)
24+
vector_def =
25+
EmbeddingDefinition.find_by(
26+
id: SiteSetting.ai_embeddings_selected_model,
27+
) if vector_def.nil?
2528
raise "Invalid embeddings selected model" if vector_def.nil?
2629

2730
case target_klass&.name

lib/embeddings/vector.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def gen_bulk_reprensentations(relation)
2525
idletime: 30,
2626
)
2727

28-
schema = DiscourseAi::Embeddings::Schema.for(relation.first.class)
28+
schema = DiscourseAi::Embeddings::Schema.for(relation.first.class, vector_def: @vdef)
2929

3030
embedding_gen = vdef.inference_client
3131
promised_embeddings =
@@ -58,7 +58,7 @@ def generate_representation_from(target)
5858
text = vdef.prepare_target_text(target)
5959
return if text.blank?
6060

61-
schema = DiscourseAi::Embeddings::Schema.for(target.class)
61+
schema = DiscourseAi::Embeddings::Schema.for(target.class, vector_def: @vdef)
6262

6363
new_digest = OpenSSL::Digest::SHA1.hexdigest(text)
6464
return if schema.find_by_target(target)&.digest == new_digest

spec/jobs/scheduled/embeddings_backfill_spec.rb

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,23 +20,23 @@
2020
end
2121

2222
fab!(:vector_def) { Fabricate(:embedding_definition) }
23+
fab!(:vector_def2) { Fabricate(:embedding_definition) }
24+
fab!(:embedding_array) { Array.new(1024) { 1 } }
2325

2426
before do
2527
SiteSetting.ai_embeddings_selected_model = vector_def.id
2628
SiteSetting.ai_embeddings_enabled = true
2729
SiteSetting.ai_embeddings_backfill_batch_size = 1
2830
SiteSetting.ai_embeddings_per_post_enabled = true
2931
Jobs.run_immediately!
30-
end
31-
32-
it "backfills topics based on bumped_at date" do
33-
embedding = Array.new(1024) { 1 }
3432

3533
WebMock.stub_request(:post, "https://test.com/embeddings").to_return(
3634
status: 200,
37-
body: JSON.dump(embedding),
35+
body: JSON.dump(embedding_array),
3836
)
37+
end
3938

39+
it "backfills topics based on bumped_at date" do
4040
Jobs::EmbeddingsBackfill.new.execute({})
4141

4242
topic_ids =
@@ -68,4 +68,19 @@
6868

6969
expect(index_date).to be_within_one_second_of(Time.zone.now)
7070
end
71+
72+
it "backfills topics based on bumped_at date" do
73+
SiteSetting.ai_embeddings_backfill_model = vector_def2.id
74+
SiteSetting.ai_embeddings_backfill_batch_size = 100
75+
76+
Jobs::EmbeddingsBackfill.new.execute({})
77+
78+
topic_ids =
79+
DB.query_single(
80+
"SELECT topic_id from #{DiscourseAi::Embeddings::Schema::TOPICS_TABLE} WHERE model_id = ?",
81+
vector_def2.id,
82+
)
83+
84+
expect(topic_ids).to contain_exactly(first_topic.id, second_topic.id, third_topic.id)
85+
end
7186
end

0 commit comments

Comments
 (0)