This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 6247906

FEATURE: Seamless embedding model upgrades (#1486)
1 parent ab5edae commit 6247906

6 files changed: +152 additions, -109 deletions
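This commit lets a site generate embeddings for a second model while the current one stays in production: a hidden ai_embeddings_backfill_model setting points the backfill job at the extra model, and the schema/vector plumbing learns to target a specific EmbeddingDefinition. As a rough console sketch of how an upgrade could be driven (the ordering of steps is an assumption inferred from the diff, not part of the commit; EmbeddingDefinition.last is a placeholder lookup):

# Hypothetical Rails-console walkthrough; setting names and classes come from the diff,
# the sequence of steps is an assumption.
new_def = EmbeddingDefinition.last # placeholder: the freshly configured definition
SiteSetting.ai_embeddings_backfill_model = new_def.id # backfill the new model in parallel
Jobs::EmbeddingsBackfill.new.execute({}) # or simply let the scheduled job run
# Once the new model's rows are populated, switch over and stop the extra backfill:
SiteSetting.ai_embeddings_selected_model = new_def.id
SiteSetting.ai_embeddings_backfill_model = ""
DiscourseAi::Embeddings::SemanticRelated.clear_cache! # drop cached related-topic lists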

app/jobs/scheduled/embeddings_backfill.rb

Lines changed: 105 additions & 95 deletions
@@ -18,105 +18,115 @@ def execute(args)
         )
       end

-      rebaked = 0
+      production_vector = DiscourseAi::Embeddings::Vector.instance

-      vector = DiscourseAi::Embeddings::Vector.instance
-      vector_def = vector.vdef
-      table_name = DiscourseAi::Embeddings::Schema::TOPICS_TABLE
-
-      topics =
-        Topic
-          .joins(
-            "LEFT JOIN #{table_name} ON #{table_name}.topic_id = topics.id AND #{table_name}.model_id = #{vector_def.id}",
+      if SiteSetting.ai_embeddings_backfill_model.present? &&
+           SiteSetting.ai_embeddings_backfill_model != SiteSetting.ai_embeddings_selected_model
+        backfill_vector =
+          DiscourseAi::Embeddings::Vector.new(
+            EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_backfill_model),
          )
-          .where(archetype: Archetype.default)
-          .where(deleted_at: nil)
-          .order("topics.bumped_at DESC")
-
-      rebaked += populate_topic_embeddings(vector, topics.limit(limit - rebaked))
-
-      return if rebaked >= limit
-
-      # Then, we'll try to backfill embeddings for topics that have outdated
-      # embeddings, be it model or strategy version
-      relation = topics.where(<<~SQL).limit(limit - rebaked)
-        #{table_name}.model_version < #{vector_def.version}
-        OR
-        #{table_name}.strategy_version < #{vector_def.strategy_version}
-      SQL
-
-      rebaked += populate_topic_embeddings(vector, relation, force: true)
-
-      return if rebaked >= limit
+      end

-      # Finally, we'll try to backfill embeddings for topics that have outdated
-      # embeddings due to edits or new replies. Here we only do 10% of the limit
-      relation =
-        topics
-          .where("#{table_name}.updated_at < ?", 6.hours.ago)
-          .where("#{table_name}.updated_at < topics.updated_at")
+      topic_work_list = []
+      topic_work_list << production_vector
+      topic_work_list << backfill_vector if backfill_vector
+
+      topic_work_list.each do |vector|
+        rebaked = 0
+        table_name = DiscourseAi::Embeddings::Schema::TOPICS_TABLE
+        vector_def = vector.vdef
+
+        topics =
+          Topic
+            .joins(
+              "LEFT JOIN #{table_name} ON #{table_name}.topic_id = topics.id AND #{table_name}.model_id = #{vector_def.id}",
+            )
+            .where(archetype: Archetype.default)
+            .where(deleted_at: nil)
+            .order("topics.bumped_at DESC")
+
+        rebaked += populate_topic_embeddings(vector, topics.limit(limit - rebaked))
+
+        next if rebaked >= limit
+
+        # Then, we'll try to backfill embeddings for topics that have outdated
+        # embeddings, be it model or strategy version
+        relation = topics.where(<<~SQL).limit(limit - rebaked)
+          #{table_name}.model_version < #{vector_def.version}
+          OR
+          #{table_name}.strategy_version < #{vector_def.strategy_version}
+        SQL
+
+        rebaked += populate_topic_embeddings(vector, relation, force: true)
+
+        next if rebaked >= limit
+
+        # Finally, we'll try to backfill embeddings for topics that have outdated
+        # embeddings due to edits or new replies. Here we only do 10% of the limit
+        relation =
+          topics
+            .where("#{table_name}.updated_at < ?", 6.hours.ago)
+            .where("#{table_name}.updated_at < topics.updated_at")
+            .limit((limit - rebaked) / 10)
+
+        populate_topic_embeddings(vector, relation, force: true)
+
+        next unless SiteSetting.ai_embeddings_per_post_enabled
+
+        # Now for posts
+        table_name = DiscourseAi::Embeddings::Schema::POSTS_TABLE
+        posts_batch_size = 1000
+
+        posts =
+          Post
+            .joins(
+              "LEFT JOIN #{table_name} ON #{table_name}.post_id = posts.id AND #{table_name}.model_id = #{vector_def.id}",
+            )
+            .where(deleted_at: nil)
+            .where(post_type: Post.types[:regular])
+
+        # First, we'll try to backfill embeddings for posts that have none
+        posts
+          .where("#{table_name}.post_id IS NULL")
+          .limit(limit - rebaked)
+          .pluck(:id)
+          .each_slice(posts_batch_size) do |batch|
+            vector.gen_bulk_reprensentations(Post.where(id: batch))
+            rebaked += batch.length
+          end
+
+        next if rebaked >= limit
+
+        # Then, we'll try to backfill embeddings for posts that have outdated
+        # embeddings, be it model or strategy version
+        posts
+          .where(<<~SQL)
+            #{table_name}.model_version < #{vector_def.version}
+            OR
+            #{table_name}.strategy_version < #{vector_def.strategy_version}
+          SQL
+          .limit(limit - rebaked)
+          .pluck(:id)
+          .each_slice(posts_batch_size) do |batch|
+            vector.gen_bulk_reprensentations(Post.where(id: batch))
+            rebaked += batch.length
+          end
+
+        next if rebaked >= limit
+
+        # Finally, we'll try to backfill embeddings for posts that have outdated
+        # embeddings due to edits. Here we only do 10% of the limit
+        posts
+          .where("#{table_name}.updated_at < ?", 7.days.ago)
+          .order("random()")
          .limit((limit - rebaked) / 10)
-
-      populate_topic_embeddings(vector, relation, force: true)
-
-      return if rebaked >= limit
-
-      return unless SiteSetting.ai_embeddings_per_post_enabled
-
-      # Now for posts
-      table_name = DiscourseAi::Embeddings::Schema::POSTS_TABLE
-      posts_batch_size = 1000
-
-      posts =
-        Post
-          .joins(
-            "LEFT JOIN #{table_name} ON #{table_name}.post_id = posts.id AND #{table_name}.model_id = #{vector_def.id}",
-          )
-          .where(deleted_at: nil)
-          .where(post_type: Post.types[:regular])
-
-      # First, we'll try to backfill embeddings for posts that have none
-      posts
-        .where("#{table_name}.post_id IS NULL")
-        .limit(limit - rebaked)
-        .pluck(:id)
-        .each_slice(posts_batch_size) do |batch|
-          vector.gen_bulk_reprensentations(Post.where(id: batch))
-          rebaked += batch.length
-        end
-
-      return if rebaked >= limit
-
-      # Then, we'll try to backfill embeddings for posts that have outdated
-      # embeddings, be it model or strategy version
-      posts
-        .where(<<~SQL)
-          #{table_name}.model_version < #{vector_def.version}
-          OR
-          #{table_name}.strategy_version < #{vector_def.strategy_version}
-        SQL
-        .limit(limit - rebaked)
-        .pluck(:id)
-        .each_slice(posts_batch_size) do |batch|
-          vector.gen_bulk_reprensentations(Post.where(id: batch))
-          rebaked += batch.length
-        end
-
-      return if rebaked >= limit
-
-      # Finally, we'll try to backfill embeddings for posts that have outdated
-      # embeddings due to edits. Here we only do 10% of the limit
-      posts
-        .where("#{table_name}.updated_at < ?", 7.days.ago)
-        .order("random()")
-        .limit((limit - rebaked) / 10)
-        .pluck(:id)
-        .each_slice(posts_batch_size) do |batch|
-          vector.gen_bulk_reprensentations(Post.where(id: batch))
-          rebaked += batch.length
-        end
-
-      rebaked
+          .pluck(:id)
+          .each_slice(posts_batch_size) do |batch|
+            vector.gen_bulk_reprensentations(Post.where(id: batch))
+            rebaked += batch.length
+          end
+      end
     end

     private
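The job above now builds a small work list: the production Vector plus, when ai_embeddings_backfill_model names a different model, a second Vector wrapping that definition. Each pass then uses next instead of return, so finishing early for one model does not skip the other. A minimal sketch of the two ways a Vector is obtained (the puts inspection is illustrative only):

production_vector = DiscourseAi::Embeddings::Vector.instance # wraps the selected model
backfill_def = EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_backfill_model)
backfill_vector = DiscourseAi::Embeddings::Vector.new(backfill_def) if backfill_def
# Each vector carries its own definition, so versions and embedding rows are per model:
[production_vector, backfill_vector].compact.each { |v| puts v.vdef.id }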

config/settings.yml

Lines changed: 9 additions & 3 deletions
@@ -230,20 +230,26 @@ discourse_ai:
     enum: "DiscourseAi::Configuration::EmbeddingDefsEnumerator"
     validator: "DiscourseAi::Configuration::EmbeddingDefsValidator"
     area: "ai-features/embeddings"
+  ai_embeddings_backfill_model:
+    type: enum
+    default: ""
+    allow_any: false
+    enum: "DiscourseAi::Configuration::EmbeddingDefsEnumerator"
+    hidden: true
   ai_embeddings_per_post_enabled:
     default: false
     hidden: true
-  ai_embeddings_generate_for_pms:
+  ai_embeddings_generate_for_pms:
     default: false
     area: "ai-features/embeddings"
   ai_embeddings_semantic_related_topics_enabled:
     default: false
     client: true
     area: "ai-features/embeddings"
-  ai_embeddings_semantic_related_topics:
+  ai_embeddings_semantic_related_topics:
     default: 5
     area: "ai-features/embeddings"
-  ai_embeddings_semantic_related_include_closed_topics:
+  ai_embeddings_semantic_related_include_closed_topics:
     default: true
     area: "ai-features/embeddings"
   ai_embeddings_backfill_batch_size:
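Because the new setting is hidden: true, it has no admin-UI control and would presumably be flipped from the Rails console or the API (an assumption about usage; the empty-string default comes from the entry above):

backfill_def = EmbeddingDefinition.last # placeholder: the definition to pre-backfill
SiteSetting.ai_embeddings_backfill_model = backfill_def.id
# Restoring the default "" turns the extra backfill pass off again.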

lib/embeddings/schema.rb

Lines changed: 5 additions & 2 deletions
@@ -20,8 +20,11 @@ class Schema
       MissingEmbeddingError = Class.new(StandardError)

       class << self
-        def for(target_klass)
-          vector_def = EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_selected_model)
+        def for(target_klass, vector_def: nil)
+          vector_def =
+            EmbeddingDefinition.find_by(
+              id: SiteSetting.ai_embeddings_selected_model,
+            ) if vector_def.nil?
           raise "Invalid embeddings selected model" if vector_def.nil?

           case target_klass&.name
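The new vector_def: keyword lets a caller pin the schema to a specific embedding definition; left out, it falls back to ai_embeddings_selected_model exactly as before. A short usage sketch (the backfill_def lookup is an assumed example):

backfill_def = EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_backfill_model)
schema = DiscourseAi::Embeddings::Schema.for(Topic, vector_def: backfill_def)
# Omitting the keyword keeps the old behaviour:
default_schema = DiscourseAi::Embeddings::Schema.for(Topic)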

lib/embeddings/semantic_related.rb

Lines changed: 11 additions & 2 deletions
@@ -3,6 +3,8 @@
 module DiscourseAi
   module Embeddings
     class SemanticRelated
+      CACHE_PREFIX = "semantic-suggested-topic-"
+
       def self.clear_cache_for(topic)
         Discourse.cache.delete("semantic-suggested-topic-#{topic.id}")
         Discourse.redis.del("build-semantic-suggested-topic-#{topic.id}")
@@ -79,14 +81,21 @@ def self.related_topics_for_crawler(controller)
         )
       end

+      def self.clear_cache!
+        Discourse
+          .cache
+          .keys("#{CACHE_PREFIX}*")
+          .each { |key| Discourse.cache.delete(key.split(":").last) }
+      end
+
       private

       def semantic_suggested_key(topic_id)
-        "semantic-suggested-topic-#{topic_id}"
+        "#{CACHE_PREFIX}#{topic_id}"
       end

       def build_semantic_suggested_key(topic_id)
-        "build-semantic-suggested-topic-#{topic_id}"
+        "build-#{CACHE_PREFIX}#{topic_id}"
       end
     end
   end
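clear_cache! sweeps every cached suggested-topics list by the shared key prefix, which looks intended for use right after an embedding model switch so related topics get recomputed (the trigger is an assumption; the calls themselves are from the diff):

# Flush all cached "semantic suggested" lists:
DiscourseAi::Embeddings::SemanticRelated.clear_cache!
# The existing per-topic variant is unchanged:
DiscourseAi::Embeddings::SemanticRelated.clear_cache_for(Topic.last) # Topic.last is illustrative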

lib/embeddings/vector.rb

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@ def gen_bulk_reprensentations(relation)
           idletime: 30,
         )

-        schema = DiscourseAi::Embeddings::Schema.for(relation.first.class)
+        schema = DiscourseAi::Embeddings::Schema.for(relation.first.class, vector_def: @vdef)

         embedding_gen = vdef.inference_client
         promised_embeddings =
@@ -58,7 +58,7 @@ def generate_representation_from(target)
         text = vdef.prepare_target_text(target)
         return if text.blank?

-        schema = DiscourseAi::Embeddings::Schema.for(target.class)
+        schema = DiscourseAi::Embeddings::Schema.for(target.class, vector_def: @vdef)

         new_digest = OpenSSL::Digest::SHA1.hexdigest(text)
         return if schema.find_by_target(target)&.digest == new_digest
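Passing @vdef through means a Vector built around the backfill definition reads and writes that model's embedding rows rather than the selected model's. A brief sketch, assuming a backfill definition is configured and treating the post IDs as placeholders:

backfill_def = EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_backfill_model)
vector = DiscourseAi::Embeddings::Vector.new(backfill_def)
vector.gen_bulk_reprensentations(Post.where(id: Post.limit(10).pluck(:id)))
# Rows are stored under backfill_def.id, independent of ai_embeddings_selected_model.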

spec/jobs/scheduled/embeddings_backfill_spec.rb

Lines changed: 20 additions & 5 deletions
@@ -20,23 +20,23 @@
   end

   fab!(:vector_def) { Fabricate(:embedding_definition) }
+  fab!(:vector_def2) { Fabricate(:embedding_definition) }
+  fab!(:embedding_array) { Array.new(1024) { 1 } }

   before do
     SiteSetting.ai_embeddings_selected_model = vector_def.id
     SiteSetting.ai_embeddings_enabled = true
     SiteSetting.ai_embeddings_backfill_batch_size = 1
     SiteSetting.ai_embeddings_per_post_enabled = true
     Jobs.run_immediately!
-  end
-
-  it "backfills topics based on bumped_at date" do
-    embedding = Array.new(1024) { 1 }

     WebMock.stub_request(:post, "https://test.com/embeddings").to_return(
       status: 200,
-      body: JSON.dump(embedding),
+      body: JSON.dump(embedding_array),
     )
+  end

+  it "backfills topics based on bumped_at date" do
     Jobs::EmbeddingsBackfill.new.execute({})

     topic_ids =
@@ -68,4 +68,19 @@

     expect(index_date).to be_within_one_second_of(Time.zone.now)
   end
+
+  it "backfills embeddings for the ai_embeddings_backfill_model" do
+    SiteSetting.ai_embeddings_backfill_model = vector_def2.id
+    SiteSetting.ai_embeddings_backfill_batch_size = 100
+
+    Jobs::EmbeddingsBackfill.new.execute({})
+
+    topic_ids =
+      DB.query_single(
+        "SELECT topic_id from #{DiscourseAi::Embeddings::Schema::TOPICS_TABLE} WHERE model_id = ?",
+        vector_def2.id,
+      )
+
+    expect(topic_ids).to contain_exactly(first_topic.id, second_topic.id, third_topic.id)
+  end
 end
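The new example asserts coverage by querying the topics embedding table for rows tagged with the backfill model's id; the same query doubles as a quick progress check outside the test suite (a sketch, with backfill_def assumed to be set):

backfill_def = EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_backfill_model)
DB.query_single(
  "SELECT COUNT(*) FROM #{DiscourseAi::Embeddings::Schema::TOPICS_TABLE} WHERE model_id = ?",
  backfill_def.id,
).first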
