
Commit 55743cb

FIX: Open AI embeddings config migration & Seeded indexes cleanup
This change fixes two different problems.

First, we add a data migration to migrate the configuration of sites using Open AI's embedding model. There was a window between the embedding config changes and #1087 where sites could end up in a broken state due to an unconfigured selected model setting, as reported at https://meta.discourse.org/t/-/348964.

The second fix drops the pre-seeded search indexes of the models we didn't migrate and rebuilds the ones whose dimensions no longer match. Since the index name uses the model ID, a new embedding config could end up reusing one of these indexes even though its dimensions no longer match.
1 parent ad7bb9b commit 55743cb

4 files changed, +377 -0 lines changed
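For context, a rough sketch (not part of the commit) of the broken window described in the commit message, assuming a Rails console on a site running the plugin; the setting and table names are the same ones the migration below reads:

legacy_model = DB.query_single(
  "SELECT value FROM site_settings WHERE name = 'ai_embeddings_model'",
).first
selected_model = DB.query_single(
  "SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
).first
definition_count = DB.query_single("SELECT COUNT(*) FROM embedding_definitions").first

# The legacy model setting is still there, but nothing was ever copied into
# embedding_definitions and no model is selected, so embeddings stop working.
if legacy_model.present? && selected_model.blank? && definition_count.zero?
  puts "site is in the broken state this migration repairs"
end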
db/migrate/20250125162658_fix_broken_open_ai_embeddings_config.rb (107 additions & 0 deletions)
# frozen_string_literal: true

class FixBrokenOpenAiEmbeddingsConfig < ActiveRecord::Migration[7.2]
  def up
    return if fetch_setting("ai_embeddings_selected_model").present?

    return if DB.query_single("SELECT COUNT(*) FROM embedding_definitions").first > 0

    open_ai_models = %w[text-embedding-3-large text-embedding-3-small text-embedding-ada-002]
    current_model = fetch_setting("ai_embeddings_model")
    return if !open_ai_models.include?(current_model)

    endpoint = fetch_setting("ai_openai_embeddings_url") || "https://api.openai.com/v1/embeddings"
    api_key = fetch_setting("ai_openai_api_key")
    return if api_key.blank?

    attrs = {
      display_name: current_model,
      url: endpoint,
      api_key: api_key,
      provider: "open_ai",
    }.merge(model_attrs(current_model))

    persist_config(attrs)
  end

  def fetch_setting(name)
    DB.query_single(
      "SELECT value FROM site_settings WHERE name = :setting_name",
      setting_name: name,
    ).first || ENV["DISCOURSE_#{name&.upcase}"]
  end

  def model_attrs(model_name)
    if model_name == "text-embedding-3-large"
      {
        dimensions: 2000,
        max_sequence_length: 8191,
        id: 7,
        pg_function: "<=>",
        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
        matryoshka_dimensions: true,
        provider_params: {
          model_name: "text-embedding-3-large",
        },
      }
    elsif model_name == "text-embedding-3-small"
      {
        dimensions: 1536,
        max_sequence_length: 8191,
        id: 6,
        pg_function: "<=>",
        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
        provider_params: {
          model_name: "text-embedding-3-small",
        },
      }
    else
      {
        dimensions: 1536,
        max_sequence_length: 8191,
        id: 2,
        pg_function: "<=>",
        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
        provider_params: {
          model_name: "text-embedding-ada-002",
        },
      }
    end
  end

  def persist_config(attrs)
    DB.exec(
      <<~SQL,
        INSERT INTO embedding_definitions (id, display_name, dimensions, max_sequence_length, version, pg_function, provider, tokenizer_class, url, api_key, provider_params, matryoshka_dimensions, created_at, updated_at)
        VALUES (:id, :display_name, :dimensions, :max_sequence_length, 1, :pg_function, :provider, :tokenizer_class, :url, :api_key, :provider_params, :matryoshka_dimensions, :now, :now)
      SQL
      id: attrs[:id],
      display_name: attrs[:display_name],
      dimensions: attrs[:dimensions],
      max_sequence_length: attrs[:max_sequence_length],
      pg_function: attrs[:pg_function],
      provider: attrs[:provider],
      tokenizer_class: attrs[:tokenizer_class],
      url: attrs[:url],
      api_key: attrs[:api_key],
      provider_params: attrs[:provider_params]&.to_json,
      matryoshka_dimensions: !!attrs[:matryoshka_dimensions],
      now: Time.zone.now,
    )

    # We hardcoded the ID to match with already generated embeddings. Let's restart the seq to avoid conflicts.
    DB.exec(
      "ALTER SEQUENCE embedding_definitions_id_seq RESTART WITH :new_seq",
      new_seq: attrs[:id].to_i + 1,
    )

    DB.exec(<<~SQL, new_value: attrs[:id])
      INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
      VALUES ('ai_embeddings_selected_model', 3, ':new_value', NOW(), NOW())
    SQL
  end

  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
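Not part of the diff: a rough verification sketch, assuming a Rails console on an affected site after the migration has run. It reuses the same tables the migration writes to; the legacy Open AI model should now map to one of the hardcoded definition IDs (7, 6 or 2) and be selected via ai_embeddings_selected_model.

selected_id = DB.query_single(
  "SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
).first

DB.query(
  "SELECT id, display_name, dimensions FROM embedding_definitions WHERE id = :id",
  id: selected_id,
)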
db/migrate/20250127145305_clean_unused_embedding_search_indexes.rb (61 additions & 0 deletions)
# frozen_string_literal: true
class CleanUnusedEmbeddingSearchIndexes < ActiveRecord::Migration[7.2]
  def up
    existing_definitions =
      DB.query("SELECT id, dimensions FROM embedding_definitions WHERE id <= 8")

    drop_statements =
      (1..8)
        .reduce([]) do |memo, model_id|
          model = existing_definitions.find { |ed| ed&.id == model_id }

          if model.blank? || !correctly_indexed?(model)
            embedding_tables.each do |type|
              memo << "DROP INDEX IF EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit;"
            end
          end

          memo
        end
        .join("\n")

    DB.exec(drop_statements) if drop_statements.present?

    amend_statements =
      (1..8)
        .reduce([]) do |memo, model_id|
          model = existing_definitions.find { |ed| ed&.id == model_id }

          memo << amended_idxs(model) if model.present? && !correctly_indexed?(model)

          memo
        end
        .join("\n")

    DB.exec(amend_statements) if amend_statements.present?
  end

  def embedding_tables
    %w[topics posts document_fragments]
  end

  def amended_idxs(model)
    embedding_tables.map { |t| <<~SQL }.join("\n")
      CREATE INDEX IF NOT EXISTS ai_#{t}_embeddings_#{model.id}_1_search_bit ON ai_#{t}_embeddings
      USING hnsw ((binary_quantize(embeddings)::bit(#{model.dimensions})) bit_hamming_ops)
      WHERE model_id = #{model.id} AND strategy_id = 1;
    SQL
  end

  def correctly_indexed?(edef)
    seeded_dimensions[edef.id] == edef.dimensions
  end

  def seeded_dimensions
    { 1 => 768, 2 => 1536, 3 => 1024, 4 => 1024, 5 => 768, 6 => 1536, 7 => 2000, 8 => 1024 }
  end

  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
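As an illustration only (the ID and dimensions here are assumed, not taken from the diff): if the definition stored under pre-seeded ID 3 now uses 768 dimensions instead of the seeded 1,024, the migration first drops the stale ai_*_embeddings_3_1_search_bit indexes, and amended_idxs then emits, for each embedding table, a statement like:

CREATE INDEX IF NOT EXISTS ai_topics_embeddings_3_1_search_bit ON ai_topics_embeddings
USING hnsw ((binary_quantize(embeddings)::bit(768)) bit_hamming_ops)
WHERE model_id = 3 AND strategy_id = 1;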
Lines changed: 73 additions & 0 deletions (migration spec for FixBrokenOpenAiEmbeddingsConfig)
# frozen_string_literal: true

require "rails_helper"
require Rails.root.join(
          "plugins/discourse-ai/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config",
        )

RSpec.describe FixBrokenOpenAiEmbeddingsConfig do
  let(:connection) { ActiveRecord::Base.connection }

  def store_setting(name, val)
    connection.execute <<~SQL
      INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
      VALUES ('#{name}', 3, '#{val}', NOW(), NOW())
    SQL
  end

  def configured_model_id
    DB.query_single(
      "SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
    ).first
  end

  describe "#up" do
    context "when embeddings are already configured" do
      fab!(:embedding_definition)

      before { store_setting("ai_embeddings_selected_model", embedding_definition.id) }

      it "does nothing" do
        subject.up

        expect(configured_model_id).to eq(embedding_definition.id.to_s)
      end
    end

    context "when there is no previous config" do
      it "does nothing" do
        subject.up

        expect(configured_model_id).to be_blank
      end
    end

    context "when things are not fully configured" do
      before do
        store_setting("ai_embeddings_model", "text-embedding-3-large")
        store_setting("ai_openai_api_key", "")
      end

      it "does nothing" do
        subject.up

        expect(configured_model_id).to be_blank
      end
    end

    context "when we have a configuration that previously failed to copy" do
      before do
        store_setting("ai_embeddings_model", "text-embedding-3-large")
        store_setting("ai_openai_api_key", "123")
      end

      it "copies the config" do
        subject.up

        embedding_def = EmbeddingDefinition.last

        expect(configured_model_id).to eq(embedding_def.id.to_s)
      end
    end
  end
end
Lines changed: 136 additions & 0 deletions (migration spec for CleanUnusedEmbeddingSearchIndexes)
# frozen_string_literal: true

require "rails_helper"
require Rails.root.join(
          "plugins/discourse-ai/db/migrate/20250127145305_clean_unused_embedding_search_indexes",
        )

RSpec.describe CleanUnusedEmbeddingSearchIndexes do
  let(:connection) { ActiveRecord::Base.connection }

  describe "#up" do
    before do
      # Copied from 20241008054440_create_binary_indexes_for_embeddings
      %w[topics posts document_fragments].each do |type|
        # our supported embeddings models IDs and dimensions
        [
          [1, 768],
          [2, 1536],
          [3, 1024],
          [4, 1024],
          [5, 768],
          [6, 1536],
          [7, 2000],
          [8, 1024],
        ].each { |model_id, dimensions| connection.execute <<-SQL }
            CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit ON ai_#{type}_embeddings
            USING hnsw ((binary_quantize(embeddings)::bit(#{dimensions})) bit_hamming_ops)
            WHERE model_id = #{model_id} AND strategy_id = 1;
          SQL
      end
    end

    let(:all_idx_names) do
      %w[topics posts document_fragments].reduce([]) do |memo, type|
        (1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }

        memo
      end
    end

    context "when there are no embedding definitions" do
      it "removes all indexes" do
        subject.up

        remaining_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: all_idx_names,
          )

        expect(remaining_idxs).to be_empty
      end
    end

    context "when there is an embedding definition with the same dimensions" do
      fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }

      it "keeps the matching index and removes the rest" do
        expected_model_idxs =
          %w[topics posts document_fragments].reduce([]) do |memo, type|
            memo << "ai_#{type}_embeddings_2_1_search_bit"
          end

        subject.up

        remaining_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: all_idx_names,
          )

        expect(remaining_idxs).to contain_exactly(*expected_model_idxs)
        # This method checks dimensions are correct.
        expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def)).to eq(true)
      end
    end

    context "when there is an embedding definition with different dimensions" do
      fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }
      fab!(:embedding_def_2) { Fabricate(:embedding_definition, id: 3, dimensions: 768) }

      it "updates the index to use the right dimensions" do
        expected_model_idxs =
          %w[topics posts document_fragments].reduce([]) do |memo, type|
            memo << "ai_#{type}_embeddings_2_1_search_bit"
            memo << "ai_#{type}_embeddings_3_1_search_bit"
          end

        subject.up

        remaining_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: all_idx_names,
          )

        expect(remaining_idxs).to contain_exactly(*expected_model_idxs)
        # This method checks dimensions are correct.
        expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def_2)).to eq(true)
      end
    end

    context "when there are indexes outside the pre-seeded range" do
      before { %w[topics posts document_fragments].each { |type| connection.execute <<-SQL } }
          CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_9_1_search_bit ON ai_#{type}_embeddings
          USING hnsw ((binary_quantize(embeddings)::bit(556)) bit_hamming_ops)
          WHERE model_id = 9 AND strategy_id = 1;
        SQL

      let(:all_idx_names) do
        %w[topics posts document_fragments].reduce([]) do |memo, type|
          (1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }

          memo
        end
      end

      it "leaves them untouched" do
        expected_model_idxs =
          %w[topics posts document_fragments].reduce([]) do |memo, type|
            memo << "ai_#{type}_embeddings_9_1_search_bit"
          end

        subject.up

        other_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: expected_model_idxs,
          )

        expect(other_idxs).to contain_exactly(*expected_model_idxs)
      end
    end
  end
end
