Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit e2e753d

Browse files
authored
FEATURE: Formalize support for matryoshka dimensions. (#1083)
We now have a flag to signal that we are shortening the embeddings of a model. It is currently only used for OpenAI's text-embedding-3-* models, but we plan to use it for other services.
1 parent 654f90f commit e2e753d

File tree

10 files changed

+66
-27
lines changed

10 files changed

+66
-27
lines changed

app/controllers/discourse_ai/admin/ai_embeddings_controller.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ def ai_embeddings_params
113113
:tokenizer_class,
114114
:embed_prompt,
115115
:search_prompt,
116+
:matryoshka_dimensions,
116117
)
117118

118119
extra_field_names = EmbeddingDefinition.provider_params.dig(permitted[:provider]&.to_sym)

app/models/embedding_definition.rb

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def presets
8484
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
8585
url: "https://api.openai.com/v1/embeddings",
8686
provider: OPEN_AI,
87+
matryoshka_dimensions: true,
8788
provider_params: {
8889
model_name: "text-embedding-3-large",
8990
},
@@ -97,6 +98,7 @@ def presets
9798
tokenizer_class: "DiscourseAi::Tokenizer::OpenAiTokenizer",
9899
url: "https://api.openai.com/v1/embeddings",
99100
provider: OPEN_AI,
101+
matryoshka_dimensions: true,
100102
provider_params: {
101103
model_name: "text-embedding-3-small",
102104
},
@@ -200,9 +202,7 @@ def hugging_face_client
200202
end
201203

202204
def open_ai_client
203-
model_name = lookup_custom_param("model_name")
204-
can_shorten_dimensions = %w[text-embedding-3-small text-embedding-3-large].include?(model_name)
205-
client_dimensions = can_shorten_dimensions ? dimensions : nil
205+
client_dimensions = matryoshka_dimensions ? dimensions : nil
206206

207207
DiscourseAi::Inference::OpenAiEmbeddings.new(
208208
endpoint_url,
@@ -221,20 +221,21 @@ def gemini_client
221221
#
222222
# Table name: embedding_definitions
223223
#
224-
# id :bigint not null, primary key
225-
# display_name :string not null
226-
# dimensions :integer not null
227-
# max_sequence_length :integer not null
228-
# version :integer default(1), not null
229-
# pg_function :string not null
230-
# provider :string not null
231-
# tokenizer_class :string not null
232-
# url :string not null
233-
# api_key :string
234-
# seeded :boolean default(FALSE), not null
235-
# provider_params :jsonb
236-
# created_at :datetime not null
237-
# updated_at :datetime not null
238-
# embed_prompt :string default(""), not null
239-
# search_prompt :string default(""), not null
224+
# id :bigint not null, primary key
225+
# display_name :string not null
226+
# dimensions :integer not null
227+
# max_sequence_length :integer not null
228+
# version :integer default(1), not null
229+
# pg_function :string not null
230+
# provider :string not null
231+
# tokenizer_class :string not null
232+
# url :string not null
233+
# api_key :string
234+
# seeded :boolean default(FALSE), not null
235+
# provider_params :jsonb
236+
# created_at :datetime not null
237+
# updated_at :datetime not null
238+
# embed_prompt :string default(""), not null
239+
# search_prompt :string default(""), not null
240+
# matryoshka_dimensions :boolean default(FALSE), not null
240241
#

app/serializers/ai_embedding_definition_serializer.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class AiEmbeddingDefinitionSerializer < ApplicationSerializer
1515
:tokenizer_class,
1616
:embed_prompt,
1717
:search_prompt,
18+
:matryoshka_dimensions,
1819
:provider_params
1920

2021
def api_key

assets/javascripts/discourse/admin/models/ai-embedding.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ export default class AiEmbedding extends RestModel {
1616
"provider_params",
1717
"pg_function",
1818
"embed_prompt",
19-
"search_prompt"
19+
"search_prompt",
20+
"matryoshka_dimensions"
2021
);
2122
}
2223

assets/javascripts/discourse/components/ai-embedding-editor.gjs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,16 @@ export default class AiEmbeddingEditor extends Component {
290290
{{/if}}
291291
</div>
292292

293+
<div class="control-group ai-embedding-editor__matryoshka_dimensions">
294+
<Input
295+
@type="checkbox"
296+
@checked={{this.editingModel.matryoshka_dimensions}}
297+
/>
298+
<label>{{i18n
299+
"discourse_ai.embeddings.matryoshka_dimensions"
300+
}}</label>
301+
</div>
302+
293303
<div class="control-group">
294304
<label>{{i18n "discourse_ai.embeddings.embed_prompt"}}</label>
295305
<Input

assets/stylesheets/modules/embeddings/common/ai-embedding-editor.scss

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,9 @@
2323
display: flex;
2424
align-items: center;
2525
}
26+
27+
&__matryoshka_dimensions {
28+
display: flex;
29+
align-items: flex-start;
30+
}
2631
}

config/locales/client.en.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,7 @@ en:
532532
max_sequence_length: "Sequence length"
533533
embed_prompt: "Embed prompt"
534534
search_prompt: "Search prompt"
535+
matryoshka_dimensions: "Matryoshka dimensions"
535536

536537
distance_function: "Distance function"
537538
distance_functions:
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# frozen_string_literal: true
2+
# Adds the +matryoshka_dimensions+ boolean flag to +embedding_definitions+ and
# turns it on for existing OpenAI text-embedding-3-* definitions.
class MatryoshkaDimensionsSupport < ActiveRecord::Migration[7.2]
  # Uses +up+ rather than +change+: the raw SQL backfill cannot be reversed
  # automatically, and when +change+ is defined Rails never calls +down+,
  # which would leave the explicit +down+ below as dead code.
  def up
    # Existing rows get FALSE via the column default.
    add_column :embedding_definitions, :matryoshka_dimensions, :boolean, null: false, default: false

    # Backfill: enable the flag for OpenAI definitions whose model_name is
    # text-embedding-3-large or text-embedding-3-small.
    execute <<~SQL
      UPDATE embedding_definitions
      SET matryoshka_dimensions = TRUE
      WHERE
        provider = 'open_ai' AND
        provider_params IS NOT NULL AND
        (
          (provider_params->>'model_name') = 'text-embedding-3-large' OR
          (provider_params->>'model_name') = 'text-embedding-3-small'
        )
    SQL
  end

  # Rolling back is deliberately unsupported.
  #
  # @raise [ActiveRecord::IrreversibleMigration] always
  def down
    raise ActiveRecord::IrreversibleMigration
  end
end

spec/lib/modules/embeddings/vector_spec.rb

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,10 @@ def stub_vector_mapping(text, expected_embedding)
9999

100100
it_behaves_like "generates and store embeddings using a vector definition"
101101

102-
context "when working with models that support shortening embeddings" do
102+
context "when matryoshka_dimensions is enabled" do
103103
it "passes the dimensions param" do
104104
shorter_dimensions = 10
105-
vdef.update!(
106-
dimensions: shorter_dimensions,
107-
provider_params: {
108-
model_name: "text-embedding-3-small",
109-
},
110-
)
105+
vdef.update!(dimensions: shorter_dimensions, matryoshka_dimensions: true)
111106
text = "This is a piece of text"
112107
short_expected_embedding = [0.0038493] * shorter_dimensions
113108

spec/requests/admin/ai_embeddings_controller_spec.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
tokenizer_class: "DiscourseAi::Tokenizer::BgeM3Tokenizer",
1818
embed_prompt: "I come first:",
1919
search_prompt: "prefix for search",
20+
matryoshka_dimensions: true,
2021
}
2122
end
2223

@@ -31,6 +32,7 @@
3132
expect(created_def.display_name).to eq(valid_attrs[:display_name])
3233
expect(created_def.embed_prompt).to eq(valid_attrs[:embed_prompt])
3334
expect(created_def.search_prompt).to eq(valid_attrs[:search_prompt])
35+
expect(created_def.matryoshka_dimensions).to eq(true)
3436
end
3537

3638
it "stores provider-specific config params" do

0 commit comments

Comments
 (0)