
Commit 55743cb

FIX: Open AI embeddings config migration & Seeded indexes cleanup
This change fixes two different problems.

First, we add a data migration to migrate the configuration of sites using Open AI's embedding model. There was a window between the embedding config changes and #1087 where sites could end up in a broken state due to an unconfigured selected model setting, as reported at https://meta.discourse.org/t/-/348964.

The second fix drops the pre-seeded search indexes of the models we didn't migrate and rebuilds the ones whose dimensions no longer match. Since the index name uses the model ID, a new embedding config could end up reusing one of these indexes even though its dimensions no longer match.
1 parent ad7bb9b commit 55743cb

4 files changed, +377 -0 lines changed
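For context, a rough sketch (not part of the commit) of the broken window described in the commit message, assuming a Rails console on a site running the plugin; the setting and table names are the same ones the migration below reads:

legacy_model = DB.query_single(
  "SELECT value FROM site_settings WHERE name = 'ai_embeddings_model'",
).first
selected_model = DB.query_single(
  "SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
).first
definition_count = DB.query_single("SELECT COUNT(*) FROM embedding_definitions").first

# The legacy model setting is still there, but nothing was ever copied into
# embedding_definitions and no model is selected, so embeddings stop working.
if legacy_model.present? && selected_model.blank? && definition_count.zero?
  puts "site is in the broken state this migration repairs"
end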
db/migrate/20250125162658_fix_broken_open_ai_embeddings_config.rb (107 additions & 0 deletions)
# frozen_string_literal: true

class FixBrokenOpenAiEmbeddingsConfig < ActiveRecord::Migration[7.2]
  def up
    return if fetch_setting("ai_embeddings_selected_model").present?

    return if DB.query_single("SELECT COUNT(*) FROM embedding_definitions").first > 0

    open_ai_models = %w[text-embedding-3-large text-embedding-3-small text-embedding-ada-002]
    current_model = fetch_setting("ai_embeddings_model")
    return if !open_ai_models.include?(current_model)

    endpoint = fetch_setting("ai_openai_embeddings_url") || "https://api.openai.com/v1/embeddings"
    api_key = fetch_setting("ai_openai_api_key")
    return if api_key.blank?

    attrs = {
      display_name: current_model,
      url: endpoint,
      api_key: api_key,
      provider: "open_ai",
    }.merge(model_attrs(current_model))

    persist_config(attrs)
  end

  def fetch_setting(name)
    DB.query_single(
      "SELECT value FROM site_settings WHERE name = :setting_name",
      setting_name: name,
    ).first || ENV["DISCOURSE_#{name&.upcase}"]
  end

  def model_attrs(model_name)
    if model_name == "text-embedding-3-large"
      {
        dimensions: 2000,
        max_sequence_length: 8191,
        id: 7,
        pg_function: "<=>",
        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
        matryoshka_dimensions: true,
        provider_params: {
          model_name: "text-embedding-3-large",
        },
      }
    elsif model_name == "text-embedding-3-small"
      {
        dimensions: 1536,
        max_sequence_length: 8191,
        id: 6,
        pg_function: "<=>",
        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
        provider_params: {
          model_name: "text-embedding-3-small",
        },
      }
    else
      {
        dimensions: 1536,
        max_sequence_length: 8191,
        id: 2,
        pg_function: "<=>",
        tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
        provider_params: {
          model_name: "text-embedding-ada-002",
        },
      }
    end
  end

  def persist_config(attrs)
    DB.exec(
      <<~SQL,
        INSERT INTO embedding_definitions (id, display_name, dimensions, max_sequence_length, version, pg_function, provider, tokenizer_class, url, api_key, provider_params, matryoshka_dimensions, created_at, updated_at)
        VALUES (:id, :display_name, :dimensions, :max_sequence_length, 1, :pg_function, :provider, :tokenizer_class, :url, :api_key, :provider_params, :matryoshka_dimensions, :now, :now)
      SQL
      id: attrs[:id],
      display_name: attrs[:display_name],
      dimensions: attrs[:dimensions],
      max_sequence_length: attrs[:max_sequence_length],
      pg_function: attrs[:pg_function],
      provider: attrs[:provider],
      tokenizer_class: attrs[:tokenizer_class],
      url: attrs[:url],
      api_key: attrs[:api_key],
      provider_params: attrs[:provider_params]&.to_json,
      matryoshka_dimensions: !!attrs[:matryoshka_dimensions],
      now: Time.zone.now,
    )

    # We hardcoded the ID to match with already generated embeddings. Let's restart the seq to avoid conflicts.
    DB.exec(
      "ALTER SEQUENCE embedding_definitions_id_seq RESTART WITH :new_seq",
      new_seq: attrs[:id].to_i + 1,
    )

    DB.exec(<<~SQL, new_value: attrs[:id])
      INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
      VALUES ('ai_embeddings_selected_model', 3, ':new_value', NOW(), NOW())
    SQL
  end

  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
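Not part of the diff: a rough verification sketch, assuming a Rails console on an affected site after the migration has run. It reuses the same tables the migration writes to; the legacy Open AI model should now map to one of the hardcoded definition IDs (7, 6 or 2) and be selected via ai_embeddings_selected_model.

selected_id = DB.query_single(
  "SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
).first

DB.query(
  "SELECT id, display_name, dimensions FROM embedding_definitions WHERE id = :id",
  id: selected_id,
)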
db/migrate/20250127145305_clean_unused_embedding_search_indexes.rb (61 additions & 0 deletions)
# frozen_string_literal: true
class CleanUnusedEmbeddingSearchIndexes < ActiveRecord::Migration[7.2]
  def up
    existing_definitions =
      DB.query("SELECT id, dimensions FROM embedding_definitions WHERE id <= 8")

    drop_statements =
      (1..8)
        .reduce([]) do |memo, model_id|
          model = existing_definitions.find { |ed| ed&.id == model_id }

          if model.blank? || !correctly_indexed?(model)
            embedding_tables.each do |type|
              memo << "DROP INDEX IF EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit;"
            end
          end

          memo
        end
        .join("\n")

    DB.exec(drop_statements) if drop_statements.present?

    amend_statements =
      (1..8)
        .reduce([]) do |memo, model_id|
          model = existing_definitions.find { |ed| ed&.id == model_id }

          memo << amended_idxs(model) if model.present? && !correctly_indexed?(model)

          memo
        end
        .join("\n")

    DB.exec(amend_statements) if amend_statements.present?
  end

  def embedding_tables
    %w[topics posts document_fragments]
  end

  def amended_idxs(model)
    embedding_tables.map { |t| <<~SQL }.join("\n")
      CREATE INDEX IF NOT EXISTS ai_#{t}_embeddings_#{model.id}_1_search_bit ON ai_#{t}_embeddings
      USING hnsw ((binary_quantize(embeddings)::bit(#{model.dimensions})) bit_hamming_ops)
      WHERE model_id = #{model.id} AND strategy_id = 1;
    SQL
  end

  def correctly_indexed?(edef)
    seeded_dimensions[edef.id] == edef.dimensions
  end

  def seeded_dimensions
    { 1 => 768, 2 => 1536, 3 => 1024, 4 => 1024, 5 => 768, 6 => 1536, 7 => 2000, 8 => 1024 }
  end

  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
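As an illustration only (the ID and dimensions here are assumed, not taken from the diff): if the definition stored under pre-seeded ID 3 now uses 768 dimensions instead of the seeded 1,024, the migration first drops the stale ai_*_embeddings_3_1_search_bit indexes, and amended_idxs then emits, for each embedding table, a statement like:

CREATE INDEX IF NOT EXISTS ai_topics_embeddings_3_1_search_bit ON ai_topics_embeddings
USING hnsw ((binary_quantize(embeddings)::bit(768)) bit_hamming_ops)
WHERE model_id = 3 AND strategy_id = 1;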
Lines changed: 73 additions & 0 deletions (migration spec for FixBrokenOpenAiEmbeddingsConfig)
# frozen_string_literal: true

require "rails_helper"
require Rails.root.join(
          "plugins/discourse-ai/db/migrate/20250125162658_fix_broken_open_ai_embeddings_config",
        )

RSpec.describe FixBrokenOpenAiEmbeddingsConfig do
  let(:connection) { ActiveRecord::Base.connection }

  def store_setting(name, val)
    connection.execute <<~SQL
      INSERT INTO site_settings(name, data_type, value, created_at, updated_at)
      VALUES ('#{name}', 3, '#{val}', NOW(), NOW())
    SQL
  end

  def configured_model_id
    DB.query_single(
      "SELECT value FROM site_settings WHERE name = 'ai_embeddings_selected_model'",
    ).first
  end

  describe "#up" do
    context "when embeddings are already configured" do
      fab!(:embedding_definition)

      before { store_setting("ai_embeddings_selected_model", embedding_definition.id) }

      it "does nothing" do
        subject.up

        expect(configured_model_id).to eq(embedding_definition.id.to_s)
      end
    end

    context "when there is no previous config" do
      it "does nothing" do
        subject.up

        expect(configured_model_id).to be_blank
      end
    end

    context "when things are not fully configured" do
      before do
        store_setting("ai_embeddings_model", "text-embedding-3-large")
        store_setting("ai_openai_api_key", "")
      end

      it "does nothing" do
        subject.up

        expect(configured_model_id).to be_blank
      end
    end

    context "when we have a configuration that previously failed to copy" do
      before do
        store_setting("ai_embeddings_model", "text-embedding-3-large")
        store_setting("ai_openai_api_key", "123")
      end

      it "copies the config" do
        subject.up

        embedding_def = EmbeddingDefinition.last

        expect(configured_model_id).to eq(embedding_def.id.to_s)
      end
    end
  end
end
Lines changed: 136 additions & 0 deletions (migration spec for CleanUnusedEmbeddingSearchIndexes)
# frozen_string_literal: true

require "rails_helper"
require Rails.root.join(
          "plugins/discourse-ai/db/migrate/20250127145305_clean_unused_embedding_search_indexes",
        )

RSpec.describe CleanUnusedEmbeddingSearchIndexes do
  let(:connection) { ActiveRecord::Base.connection }

  describe "#up" do
    before do
      # Copied from 20241008054440_create_binary_indexes_for_embeddings
      %w[topics posts document_fragments].each do |type|
        # our supported embeddings models IDs and dimensions
        [
          [1, 768],
          [2, 1536],
          [3, 1024],
          [4, 1024],
          [5, 768],
          [6, 1536],
          [7, 2000],
          [8, 1024],
        ].each { |model_id, dimensions| connection.execute <<-SQL }
            CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_#{model_id}_1_search_bit ON ai_#{type}_embeddings
            USING hnsw ((binary_quantize(embeddings)::bit(#{dimensions})) bit_hamming_ops)
            WHERE model_id = #{model_id} AND strategy_id = 1;
          SQL
      end
    end

    let(:all_idx_names) do
      %w[topics posts document_fragments].reduce([]) do |memo, type|
        (1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }

        memo
      end
    end

    context "when there are no embedding definitions" do
      it "removes all indexes" do
        subject.up

        remaining_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: all_idx_names,
          )

        expect(remaining_idxs).to be_empty
      end
    end

    context "when there is an embedding definition with the same dimensions" do
      fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }

      it "keeps the matching index and removes the rest" do
        expected_model_idxs =
          %w[topics posts document_fragments].reduce([]) do |memo, type|
            memo << "ai_#{type}_embeddings_2_1_search_bit"
          end

        subject.up

        remaining_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: all_idx_names,
          )

        expect(remaining_idxs).to contain_exactly(*expected_model_idxs)
        # This method checks dimensions are correct.
        expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def)).to eq(true)
      end
    end

    context "when there is an embedding definition with different dimensions" do
      fab!(:embedding_def) { Fabricate(:embedding_definition, id: 2, dimensions: 1536) }
      fab!(:embedding_def_2) { Fabricate(:embedding_definition, id: 3, dimensions: 768) }

      it "updates the index to use the right dimensions" do
        expected_model_idxs =
          %w[topics posts document_fragments].reduce([]) do |memo, type|
            memo << "ai_#{type}_embeddings_2_1_search_bit"
            memo << "ai_#{type}_embeddings_3_1_search_bit"
          end

        subject.up

        remaining_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: all_idx_names,
          )

        expect(remaining_idxs).to contain_exactly(*expected_model_idxs)
        # This method checks dimensions are correct.
        expect(DiscourseAi::Embeddings::Schema.correctly_indexed?(embedding_def_2)).to eq(true)
      end
    end

    context "when there are indexes outside the pre-seeded range" do
      before { %w[topics posts document_fragments].each { |type| connection.execute <<-SQL } }
          CREATE INDEX IF NOT EXISTS ai_#{type}_embeddings_9_1_search_bit ON ai_#{type}_embeddings
          USING hnsw ((binary_quantize(embeddings)::bit(556)) bit_hamming_ops)
          WHERE model_id = 9 AND strategy_id = 1;
        SQL

      let(:all_idx_names) do
        %w[topics posts document_fragments].reduce([]) do |memo, type|
          (1..8).each { |model_id| memo << "ai_#{type}_embeddings_#{model_id}_1_search_bit" }

          memo
        end
      end

      it "leaves them untouched" do
        expected_model_idxs =
          %w[topics posts document_fragments].reduce([]) do |memo, type|
            memo << "ai_#{type}_embeddings_9_1_search_bit"
          end

        subject.up

        other_idxs =
          DB.query_single(
            "SELECT indexname FROM pg_indexes WHERE indexname IN (:names)",
            names: expected_model_idxs,
          )

        expect(other_idxs).to contain_exactly(*expected_model_idxs)
      end
    end
  end
end
