Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 6cc6122

Browse files
committed
FEATURE: exclude muted categories from category suggester
The logic here is that users do not particularly care about topics in categories they have muted, so we can exclude those topics from tag and category suggestions.
1 parent 80adefa commit 6cc6122

File tree

4 files changed

+116
-4
lines changed

4 files changed

+116
-4
lines changed

lib/ai_helper/semantic_categorizer.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ def categories
1212
return [] unless SiteSetting.ai_embeddings_enabled
1313

1414
candidates = nearest_neighbors(limit: 100)
15+
return [] if candidates.empty?
16+
1517
candidate_ids = candidates.map(&:first)
1618

1719
::Topic
@@ -52,6 +54,8 @@ def tags
5254
return [] unless SiteSetting.ai_embeddings_enabled
5355

5456
candidates = nearest_neighbors(limit: 100)
57+
return [] if candidates.empty?
58+
5559
candidate_ids = candidates.map(&:first)
5660

5761
count_column = Tag.topic_count_column(@user.guardian) # Determine the count column
@@ -94,11 +98,21 @@ def nearest_neighbors(limit: 100)
9498

9599
raw_vector = vector_rep.vector_from(@text)
96100

101+
muted_category_ids = nil
102+
if @user.present?
103+
muted_category_ids =
104+
CategoryUser.where(
105+
user: @user,
106+
notification_level: CategoryUser.notification_levels[:muted],
107+
).pluck(:category_id)
108+
end
109+
97110
vector_rep.asymmetric_topics_similarity_search(
98111
raw_vector,
99112
limit: limit,
100113
offset: 0,
101114
return_distance: true,
115+
exclude_category_ids: muted_category_ids,
102116
)
103117
end
104118
end

lib/embeddings/vector_representations/base.rb

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,16 +151,22 @@ def post_id_from_representation(raw_vector)
151151
SQL
152152
end
153153

154-
def asymmetric_topics_similarity_search(raw_vector, limit:, offset:, return_distance: false)
155-
results = DB.query(<<~SQL, query_embedding: raw_vector, limit: limit, offset: offset)
154+
def asymmetric_topics_similarity_search(
155+
raw_vector,
156+
limit:,
157+
offset:,
158+
return_distance: false,
159+
exclude_category_ids: nil
160+
)
161+
builder = DB.build(<<~SQL)
156162
WITH candidates AS (
157163
SELECT
158164
topic_id,
159165
embeddings::halfvec(#{dimensions}) AS embeddings
160166
FROM
161167
#{topic_table_name}
162-
WHERE
163-
model_id = #{id} AND strategy_id = #{@strategy.id}
168+
/*join*/
169+
/*where*/
164170
ORDER BY
165171
binary_quantize(embeddings)::bit(#{dimensions}) <~> binary_quantize('[:query_embedding]'::halfvec(#{dimensions}))
166172
LIMIT :limit * 2
@@ -176,6 +182,22 @@ def asymmetric_topics_similarity_search(raw_vector, limit:, offset:, return_dist
176182
OFFSET :offset
177183
SQL
178184

185+
builder.where(
186+
"model_id = :model_id AND strategy_id = :strategy_id",
187+
model_id: id,
188+
strategy_id: @strategy.id,
189+
)
190+
191+
if exclude_category_ids.present?
192+
builder.join("topics t on t.id = topic_id")
193+
builder.where(<<~SQL, exclude_category_ids: exclude_category_ids.map(&:to_i))
194+
t.category_id NOT IN (:exclude_category_ids) AND
195+
t.category_id NOT IN (SELECT categories.id FROM categories WHERE categories.parent_category_id IN (:exclude_category_ids))
196+
SQL
197+
end
198+
199+
results = builder.query(query_embedding: raw_vector, limit: limit, offset: offset)
200+
179201
if return_distance
180202
results.map { |r| [r.topic_id, r.distance] }
181203
else
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# frozen_string_literal: true
2+
3+
RSpec.describe DiscourseAi::AiHelper::SemanticCategorizer do
  # One user who has muted `muted_category`, plus one topic in the muted category
  # and one topic in an ordinary category.
  fab!(:user)
  fab!(:muted_category) { Fabricate(:category) }
  fab!(:category_mute) do
    CategoryUser.create!(
      user: user,
      category: muted_category,
      notification_level: CategoryUser.notification_levels[:muted],
    )
  end
  fab!(:muted_topic) { Fabricate(:topic, category: muted_category) }
  fab!(:category) { Fabricate(:category) }
  fab!(:topic) { Fabricate(:topic, category: category) }

  let(:truncation) { DiscourseAi::Embeddings::Strategies::Truncation.new }
  let(:vector_rep) do
    DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(truncation)
  end
  let(:categorizer) { described_class.new({ text: "hello" }, user) }
  # The stubbed embeddings endpoint returns this same vector for every request.
  let(:expected_embedding) { Array.new(vector_rep.dimensions, 0.0038493) }

  before do
    SiteSetting.ai_embeddings_enabled = true
    SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com"
    SiteSetting.ai_embeddings_model = "bge-large-en"

    classify_url = "#{SiteSetting.ai_embeddings_discourse_service_api_endpoint}/api/v1/classify"
    WebMock.stub_request(:post, classify_url).to_return(
      status: 200,
      body: JSON.dump(expected_embedding),
    )

    # Generate a stored representation for both topics before the example runs.
    [topic, muted_topic].each do |indexed_topic|
      vector_rep.generate_representation_from(indexed_topic)
    end
  end

  it "respects user muted categories when making suggestions" do
    suggested_ids = categorizer.categories.map { |suggestion| suggestion[:id] }

    expect(suggested_ids).not_to include(muted_category.id)
    expect(suggested_ids).to include(category.id)
  end
end

spec/lib/modules/embeddings/vector_representations/vector_rep_shared_examples.rb

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,5 +104,38 @@
104104
vector_rep.asymmetric_topics_similarity_search(similar_vector, limit: 1, offset: 0),
105105
).to contain_exactly(topic.id)
106106
end
107+
108+
it "can exclude categories" do
  # Checks that `exclude_category_ids` drops a topic that sits directly in an
  # excluded category, and also a topic that sits in a subcategory of one.
  similar_vector = [0.0038494] * vector_rep.dimensions
  text =
    truncation.prepare_text_from(
      topic,
      vector_rep.tokenizer,
      vector_rep.max_sequence_length - 2,
    )
  stub_vector_mapping(text, expected_embedding_1)
  vector_rep.generate_representation_from(topic)

  # Capture the id up front: once the topic is moved below, `topic.category_id`
  # is the child category's id, and excluding that would only re-test the direct
  # match instead of the parent -> subcategory exclusion.
  parent_category_id = topic.category_id

  # Topic directly inside the excluded category.
  expect(
    vector_rep.asymmetric_topics_similarity_search(
      similar_vector,
      limit: 1,
      offset: 0,
      exclude_category_ids: [parent_category_id],
    ),
  ).to be_empty

  child_category = Fabricate(:category, parent_category_id: parent_category_id)
  topic.update!(category_id: child_category.id)

  # Topic inside a subcategory of the excluded (parent) category.
  expect(
    vector_rep.asymmetric_topics_similarity_search(
      similar_vector,
      limit: 1,
      offset: 0,
      exclude_category_ids: [parent_category_id],
    ),
  ).to be_empty
end
107140
end
108141
end

0 commit comments

Comments
 (0)