@@ -18,105 +18,115 @@ def execute(args)
1818 )
1919 end
2020
21- rebaked = 0
21+ production_vector = DiscourseAi :: Embeddings :: Vector . instance
2222
23- vector = DiscourseAi ::Embeddings ::Vector . instance
24- vector_def = vector . vdef
25- table_name = DiscourseAi ::Embeddings ::Schema ::TOPICS_TABLE
26-
27- topics =
28- Topic
29- . joins (
30- "LEFT JOIN #{ table_name } ON #{ table_name } .topic_id = topics.id AND #{ table_name } .model_id = #{ vector_def . id } " ,
23+ if SiteSetting . ai_embeddings_backfill_model . present? &&
24+ SiteSetting . ai_embeddings_backfill_model != SiteSetting . ai_embeddings_selected_model
25+ backfill_vector =
26+ DiscourseAi ::Embeddings ::Vector . new (
27+ EmbeddingDefinition . find_by ( id : SiteSetting . ai_embeddings_backfill_model ) ,
3128 )
32- . where ( archetype : Archetype . default )
33- . where ( deleted_at : nil )
34- . order ( "topics.bumped_at DESC" )
35-
36- rebaked += populate_topic_embeddings ( vector , topics . limit ( limit - rebaked ) )
37-
38- return if rebaked >= limit
39-
40- # Then, we'll try to backfill embeddings for topics that have outdated
41- # embeddings, be it model or strategy version
42- relation = topics . where ( <<~SQL ) . limit ( limit - rebaked )
43- #{ table_name } .model_version < #{ vector_def . version }
44- OR
45- #{ table_name } .strategy_version < #{ vector_def . strategy_version }
46- SQL
47-
48- rebaked += populate_topic_embeddings ( vector , relation , force : true )
49-
50- return if rebaked >= limit
29+ end
5130
52- # Finally, we'll try to backfill embeddings for topics that have outdated
53- # embeddings due to edits or new replies. Here we only do 10% of the limit
54- relation =
55- topics
56- . where ( "#{ table_name } .updated_at < ?" , 6 . hours . ago )
57- . where ( "#{ table_name } .updated_at < topics.updated_at" )
31+ topic_work_list = [ ]
32+ topic_work_list << production_vector
33+ topic_work_list << backfill_vector if backfill_vector
34+
35+ topic_work_list . each do |vector |
36+ rebaked = 0
37+ table_name = DiscourseAi ::Embeddings ::Schema ::TOPICS_TABLE
38+ vector_def = vector . vdef
39+
40+ topics =
41+ Topic
42+ . joins (
43+ "LEFT JOIN #{ table_name } ON #{ table_name } .topic_id = topics.id AND #{ table_name } .model_id = #{ vector_def . id } " ,
44+ )
45+ . where ( archetype : Archetype . default )
46+ . where ( deleted_at : nil )
47+ . order ( "topics.bumped_at DESC" )
48+
49+ rebaked += populate_topic_embeddings ( vector , topics . limit ( limit - rebaked ) )
50+
51+ next if rebaked >= limit
52+
53+ # Then, we'll try to backfill embeddings for topics that have outdated
54+ # embeddings, be it model or strategy version
55+ relation = topics . where ( <<~SQL ) . limit ( limit - rebaked )
56+ #{ table_name } .model_version < #{ vector_def . version }
57+ OR
58+ #{ table_name } .strategy_version < #{ vector_def . strategy_version }
59+ SQL
60+
61+ rebaked += populate_topic_embeddings ( vector , relation , force : true )
62+
63+ next if rebaked >= limit
64+
65+ # Finally, we'll try to backfill embeddings for topics that have outdated
66+ # embeddings due to edits or new replies. Here we only do 10% of the limit
67+ relation =
68+ topics
69+ . where ( "#{ table_name } .updated_at < ?" , 6 . hours . ago )
70+ . where ( "#{ table_name } .updated_at < topics.updated_at" )
71+ . limit ( ( limit - rebaked ) / 10 )
72+
73+ populate_topic_embeddings ( vector , relation , force : true )
74+
75+ next unless SiteSetting . ai_embeddings_per_post_enabled
76+
77+ # Now for posts
78+ table_name = DiscourseAi ::Embeddings ::Schema ::POSTS_TABLE
79+ posts_batch_size = 1000
80+
81+ posts =
82+ Post
83+ . joins (
84+ "LEFT JOIN #{ table_name } ON #{ table_name } .post_id = posts.id AND #{ table_name } .model_id = #{ vector_def . id } " ,
85+ )
86+ . where ( deleted_at : nil )
87+ . where ( post_type : Post . types [ :regular ] )
88+
89+ # First, we'll try to backfill embeddings for posts that have none
90+ posts
91+ . where ( "#{ table_name } .post_id IS NULL" )
92+ . limit ( limit - rebaked )
93+ . pluck ( :id )
94+ . each_slice ( posts_batch_size ) do |batch |
95+ vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
96+ rebaked += batch . length
97+ end
98+
99+ next if rebaked >= limit
100+
101+ # Then, we'll try to backfill embeddings for posts that have outdated
102+ # embeddings, be it model or strategy version
103+ posts
104+ . where ( <<~SQL )
105+ #{ table_name } .model_version < #{ vector_def . version }
106+ OR
107+ #{ table_name } .strategy_version < #{ vector_def . strategy_version }
108+ SQL
109+ . limit ( limit - rebaked )
110+ . pluck ( :id )
111+ . each_slice ( posts_batch_size ) do |batch |
112+ vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
113+ rebaked += batch . length
114+ end
115+
116+ next if rebaked >= limit
117+
118+ # Finally, we'll try to backfill embeddings for posts that have outdated
119+ # embeddings due to edits. Here we only do 10% of the limit
120+ posts
121+ . where ( "#{ table_name } .updated_at < ?" , 7 . days . ago )
122+ . order ( "random()" )
58123 . limit ( ( limit - rebaked ) / 10 )
59-
60- populate_topic_embeddings ( vector , relation , force : true )
61-
62- return if rebaked >= limit
63-
64- return unless SiteSetting . ai_embeddings_per_post_enabled
65-
66- # Now for posts
67- table_name = DiscourseAi ::Embeddings ::Schema ::POSTS_TABLE
68- posts_batch_size = 1000
69-
70- posts =
71- Post
72- . joins (
73- "LEFT JOIN #{ table_name } ON #{ table_name } .post_id = posts.id AND #{ table_name } .model_id = #{ vector_def . id } " ,
74- )
75- . where ( deleted_at : nil )
76- . where ( post_type : Post . types [ :regular ] )
77-
78- # First, we'll try to backfill embeddings for posts that have none
79- posts
80- . where ( "#{ table_name } .post_id IS NULL" )
81- . limit ( limit - rebaked )
82- . pluck ( :id )
83- . each_slice ( posts_batch_size ) do |batch |
84- vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
85- rebaked += batch . length
86- end
87-
88- return if rebaked >= limit
89-
90- # Then, we'll try to backfill embeddings for posts that have outdated
91- # embeddings, be it model or strategy version
92- posts
93- . where ( <<~SQL )
94- #{ table_name } .model_version < #{ vector_def . version }
95- OR
96- #{ table_name } .strategy_version < #{ vector_def . strategy_version }
97- SQL
98- . limit ( limit - rebaked )
99- . pluck ( :id )
100- . each_slice ( posts_batch_size ) do |batch |
101- vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
102- rebaked += batch . length
103- end
104-
105- return if rebaked >= limit
106-
107- # Finally, we'll try to backfill embeddings for posts that have outdated
108- # embeddings due to edits. Here we only do 10% of the limit
109- posts
110- . where ( "#{ table_name } .updated_at < ?" , 7 . days . ago )
111- . order ( "random()" )
112- . limit ( ( limit - rebaked ) / 10 )
113- . pluck ( :id )
114- . each_slice ( posts_batch_size ) do |batch |
115- vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
116- rebaked += batch . length
117- end
118-
119- rebaked
124+ . pluck ( :id )
125+ . each_slice ( posts_batch_size ) do |batch |
126+ vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
127+ rebaked += batch . length
128+ end
129+ end
120130 end
121131
122132 private
0 commit comments