From 45e1b0fcaa8b06eab5cb9f9cea41ba83737971c3 Mon Sep 17 00:00:00 2001 From: Roman Rizzi Date: Tue, 15 Jul 2025 14:16:38 -0300 Subject: [PATCH] FIX: Fix embeddings to use the old OpenAI tokenizer --- app/models/embedding_definition.rb | 6 +++--- ...15165701_update_open_ai_embeddings_tokenizer.rb | 14 ++++++++++++++ .../embeddings/ai_embedding_definition_spec.rb | 2 +- 3 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 db/migrate/20250715165701_update_open_ai_embeddings_tokenizer.rb diff --git a/app/models/embedding_definition.rb b/app/models/embedding_definition.rb index 23b37ec4a..92caabee6 100644 --- a/app/models/embedding_definition.rb +++ b/app/models/embedding_definition.rb @@ -84,7 +84,7 @@ def presets dimensions: 2000, max_sequence_length: 8191, pg_function: "<=>", - tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer", + tokenizer_class: "DiscourseAi::Tokenizer::OpenAiCl100kTokenizer", url: "https://api.openai.com/v1/embeddings", provider: OPEN_AI, matryoshka_dimensions: true, @@ -98,7 +98,7 @@ def presets dimensions: 1536, max_sequence_length: 8191, pg_function: "<=>", - tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer", + tokenizer_class: "DiscourseAi::Tokenizer::OpenAiCl100kTokenizer", url: "https://api.openai.com/v1/embeddings", provider: OPEN_AI, matryoshka_dimensions: true, @@ -112,7 +112,7 @@ def presets dimensions: 1536, max_sequence_length: 8191, pg_function: "<=>", - tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer", + tokenizer_class: "DiscourseAi::Tokenizer::OpenAiCl100kTokenizer", url: "https://api.openai.com/v1/embeddings", provider: OPEN_AI, provider_params: { diff --git a/db/migrate/20250715165701_update_open_ai_embeddings_tokenizer.rb b/db/migrate/20250715165701_update_open_ai_embeddings_tokenizer.rb new file mode 100644 index 000000000..1d8b38885 --- /dev/null +++ b/db/migrate/20250715165701_update_open_ai_embeddings_tokenizer.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true +class UpdateOpenAiEmbeddingsTokenizer < ActiveRecord::Migration[7.2] + def up + execute <<~SQL + UPDATE embedding_definitions + SET tokenizer_class = 'DiscourseAi::Tokenizer::OpenAiCl100kTokenizer' + WHERE url LIKE '%https://api.openai.com/%' AND tokenizer_class <> 'DiscourseAi::Tokenizer::OpenAiCl100kTokenizer' + SQL + end + + def down + raise ActiveRecord::IrreversibleMigration + end +end diff --git a/spec/system/embeddings/ai_embedding_definition_spec.rb b/spec/system/embeddings/ai_embedding_definition_spec.rb index f20a33276..481fdac97 100644 --- a/spec/system/embeddings/ai_embedding_definition_spec.rb +++ b/spec/system/embeddings/ai_embedding_definition_spec.rb @@ -50,7 +50,7 @@ form.field("provider").select(EmbeddingDefinition::OPEN_AI) form.field("url").fill_in("https://api.openai.com/v1/embeddings") form.field("api_key").fill_in(api_key) - form.field("tokenizer_class").select("DiscourseAi::Tokenizer::OpenAiTokenizer") + form.field("tokenizer_class").select("DiscourseAi::Tokenizer::OpenAiCl100kTokenizer") embed_prefix = "On creation:" search_prefix = "On search:"