@@ -18,105 +18,116 @@ def execute(args)
1818 )
1919 end
2020
21- rebaked = 0
22-
23- vector = DiscourseAi ::Embeddings ::Vector . instance
24- vector_def = vector . vdef
25- table_name = DiscourseAi ::Embeddings ::Schema ::TOPICS_TABLE
26-
27- topics =
28- Topic
29- . joins (
30- "LEFT JOIN #{ table_name } ON #{ table_name } .topic_id = topics.id AND #{ table_name } .model_id = #{ vector_def . id } " ,
21+ production_vector = DiscourseAi ::Embeddings ::Vector . instance
22+ production_vector_def = production_vector . vdef
23+
24+ if SiteSetting . ai_embeddings_backfill_model . present? &&
25+ SiteSetting . ai_embeddings_backfill_model != SiteSetting . ai_embeddings_selected_model
26+ backfill_vector =
27+ DiscourseAi ::Embeddings ::Vector . new (
28+ EmbeddingDefinition . find_by ( id : SiteSetting . ai_embeddings_backfill_model ) ,
3129 )
32- . where ( archetype : Archetype . default )
33- . where ( deleted_at : nil )
34- . order ( "topics.bumped_at DESC" )
35-
36- rebaked += populate_topic_embeddings ( vector , topics . limit ( limit - rebaked ) )
37-
38- return if rebaked >= limit
39-
40- # Then, we'll try to backfill embeddings for topics that have outdated
41- # embeddings, be it model or strategy version
42- relation = topics . where ( <<~SQL ) . limit ( limit - rebaked )
43- #{ table_name } .model_version < #{ vector_def . version }
44- OR
45- #{ table_name } .strategy_version < #{ vector_def . strategy_version }
46- SQL
47-
48- rebaked += populate_topic_embeddings ( vector , relation , force : true )
49-
50- return if rebaked >= limit
30+ backfill_vector_def = backfill_vector . vdef
31+ end
5132
52- # Finally, we'll try to backfill embeddings for topics that have outdated
53- # embeddings due to edits or new replies. Here we only do 10% of the limit
54- relation =
55- topics
56- . where ( "#{ table_name } .updated_at < ?" , 6 . hours . ago )
57- . where ( "#{ table_name } .updated_at < topics.updated_at" )
33+ topic_work_list = [ ]
34+ topic_work_list << [ production_vector , production_vector_def ]
35+ topic_work_list << [ backfill_vector , backfill_vector_def ] if backfill_vector
36+
37+ topic_work_list . each do |vector , vector_def |
38+ rebaked = 0
39+ table_name = DiscourseAi ::Embeddings ::Schema ::TOPICS_TABLE
40+
41+ topics =
42+ Topic
43+ . joins (
44+ "LEFT JOIN #{ table_name } ON #{ table_name } .topic_id = topics.id AND #{ table_name } .model_id = #{ vector_def . id } " ,
45+ )
46+ . where ( archetype : Archetype . default )
47+ . where ( deleted_at : nil )
48+ . order ( "topics.bumped_at DESC" )
49+
50+ rebaked += populate_topic_embeddings ( vector , topics . limit ( limit - rebaked ) )
51+
52+ next if rebaked >= limit
53+
54+ # Then, we'll try to backfill embeddings for topics that have outdated
55+ # embeddings, be it model or strategy version
56+ relation = topics . where ( <<~SQL ) . limit ( limit - rebaked )
57+ #{ table_name } .model_version < #{ vector_def . version }
58+ OR
59+ #{ table_name } .strategy_version < #{ vector_def . strategy_version }
60+ SQL
61+
62+ rebaked += populate_topic_embeddings ( vector , relation , force : true )
63+
64+ next if rebaked >= limit
65+
66+ # Finally, we'll try to backfill embeddings for topics that have outdated
67+ # embeddings due to edits or new replies. Here we only do 10% of the limit
68+ relation =
69+ topics
70+ . where ( "#{ table_name } .updated_at < ?" , 6 . hours . ago )
71+ . where ( "#{ table_name } .updated_at < topics.updated_at" )
72+ . limit ( ( limit - rebaked ) / 10 )
73+
74+ populate_topic_embeddings ( vector , relation , force : true )
75+
76+ next unless SiteSetting . ai_embeddings_per_post_enabled
77+
78+ # Now for posts
79+ table_name = DiscourseAi ::Embeddings ::Schema ::POSTS_TABLE
80+ posts_batch_size = 1000
81+
82+ posts =
83+ Post
84+ . joins (
85+ "LEFT JOIN #{ table_name } ON #{ table_name } .post_id = posts.id AND #{ table_name } .model_id = #{ vector_def . id } " ,
86+ )
87+ . where ( deleted_at : nil )
88+ . where ( post_type : Post . types [ :regular ] )
89+
90+ # First, we'll try to backfill embeddings for posts that have none
91+ posts
92+ . where ( "#{ table_name } .post_id IS NULL" )
93+ . limit ( limit - rebaked )
94+ . pluck ( :id )
95+ . each_slice ( posts_batch_size ) do |batch |
96+ vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
97+ rebaked += batch . length
98+ end
99+
100+ next if rebaked >= limit
101+
102+ # Then, we'll try to backfill embeddings for posts that have outdated
103+ # embeddings, be it model or strategy version
104+ posts
105+ . where ( <<~SQL )
106+ #{ table_name } .model_version < #{ vector_def . version }
107+ OR
108+ #{ table_name } .strategy_version < #{ vector_def . strategy_version }
109+ SQL
110+ . limit ( limit - rebaked )
111+ . pluck ( :id )
112+ . each_slice ( posts_batch_size ) do |batch |
113+ vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
114+ rebaked += batch . length
115+ end
116+
117+ next if rebaked >= limit
118+
119+ # Finally, we'll try to backfill embeddings for posts that have outdated
120+ # embeddings due to edits. Here we only do 10% of the limit
121+ posts
122+ . where ( "#{ table_name } .updated_at < ?" , 7 . days . ago )
123+ . order ( "random()" )
58124 . limit ( ( limit - rebaked ) / 10 )
59-
60- populate_topic_embeddings ( vector , relation , force : true )
61-
62- return if rebaked >= limit
63-
64- return unless SiteSetting . ai_embeddings_per_post_enabled
65-
66- # Now for posts
67- table_name = DiscourseAi ::Embeddings ::Schema ::POSTS_TABLE
68- posts_batch_size = 1000
69-
70- posts =
71- Post
72- . joins (
73- "LEFT JOIN #{ table_name } ON #{ table_name } .post_id = posts.id AND #{ table_name } .model_id = #{ vector_def . id } " ,
74- )
75- . where ( deleted_at : nil )
76- . where ( post_type : Post . types [ :regular ] )
77-
78- # First, we'll try to backfill embeddings for posts that have none
79- posts
80- . where ( "#{ table_name } .post_id IS NULL" )
81- . limit ( limit - rebaked )
82- . pluck ( :id )
83- . each_slice ( posts_batch_size ) do |batch |
84- vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
85- rebaked += batch . length
86- end
87-
88- return if rebaked >= limit
89-
90- # Then, we'll try to backfill embeddings for posts that have outdated
91- # embeddings, be it model or strategy version
92- posts
93- . where ( <<~SQL )
94- #{ table_name } .model_version < #{ vector_def . version }
95- OR
96- #{ table_name } .strategy_version < #{ vector_def . strategy_version }
97- SQL
98- . limit ( limit - rebaked )
99- . pluck ( :id )
100- . each_slice ( posts_batch_size ) do |batch |
101- vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
102- rebaked += batch . length
103- end
104-
105- return if rebaked >= limit
106-
107- # Finally, we'll try to backfill embeddings for posts that have outdated
108- # embeddings due to edits. Here we only do 10% of the limit
109- posts
110- . where ( "#{ table_name } .updated_at < ?" , 7 . days . ago )
111- . order ( "random()" )
112- . limit ( ( limit - rebaked ) / 10 )
113- . pluck ( :id )
114- . each_slice ( posts_batch_size ) do |batch |
115- vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
116- rebaked += batch . length
117- end
118-
119- rebaked
125+ . pluck ( :id )
126+ . each_slice ( posts_batch_size ) do |batch |
127+ vector . gen_bulk_reprensentations ( Post . where ( id : batch ) )
128+ rebaked += batch . length
129+ end
130+ end
120131 end
121132
122133 private
0 commit comments