Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 0cb2c41

Browse files
authored
FEATURE: exclude muted categories from category suggester (#979)
The logic here is that users do not particularly care about topics in categories they have muted, so we can exclude those topics from tag and category suggestions.
1 parent 80adefa commit 0cb2c41

File tree

4 files changed

+116
-4
lines changed

4 files changed

+116
-4
lines changed

lib/ai_helper/semantic_categorizer.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ def categories
1212
return [] unless SiteSetting.ai_embeddings_enabled
1313

1414
candidates = nearest_neighbors(limit: 100)
15+
return [] if candidates.empty?
16+
1517
candidate_ids = candidates.map(&:first)
1618

1719
::Topic
@@ -52,6 +54,8 @@ def tags
5254
return [] unless SiteSetting.ai_embeddings_enabled
5355

5456
candidates = nearest_neighbors(limit: 100)
57+
return [] if candidates.empty?
58+
5559
candidate_ids = candidates.map(&:first)
5660

5761
count_column = Tag.topic_count_column(@user.guardian) # Determine the count column
@@ -94,11 +98,21 @@ def nearest_neighbors(limit: 100)
9498

9599
raw_vector = vector_rep.vector_from(@text)
96100

101+
muted_category_ids = nil
102+
if @user.present?
103+
muted_category_ids =
104+
CategoryUser.where(
105+
user: @user,
106+
notification_level: CategoryUser.notification_levels[:muted],
107+
).pluck(:category_id)
108+
end
109+
97110
vector_rep.asymmetric_topics_similarity_search(
98111
raw_vector,
99112
limit: limit,
100113
offset: 0,
101114
return_distance: true,
115+
exclude_category_ids: muted_category_ids,
102116
)
103117
end
104118
end

lib/embeddings/vector_representations/base.rb

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,16 +151,22 @@ def post_id_from_representation(raw_vector)
151151
SQL
152152
end
153153

154-
def asymmetric_topics_similarity_search(raw_vector, limit:, offset:, return_distance: false)
155-
results = DB.query(<<~SQL, query_embedding: raw_vector, limit: limit, offset: offset)
154+
def asymmetric_topics_similarity_search(
155+
raw_vector,
156+
limit:,
157+
offset:,
158+
return_distance: false,
159+
exclude_category_ids: nil
160+
)
161+
builder = DB.build(<<~SQL)
156162
WITH candidates AS (
157163
SELECT
158164
topic_id,
159165
embeddings::halfvec(#{dimensions}) AS embeddings
160166
FROM
161167
#{topic_table_name}
162-
WHERE
163-
model_id = #{id} AND strategy_id = #{@strategy.id}
168+
/*join*/
169+
/*where*/
164170
ORDER BY
165171
binary_quantize(embeddings)::bit(#{dimensions}) <~> binary_quantize('[:query_embedding]'::halfvec(#{dimensions}))
166172
LIMIT :limit * 2
@@ -176,6 +182,22 @@ def asymmetric_topics_similarity_search(raw_vector, limit:, offset:, return_dist
176182
OFFSET :offset
177183
SQL
178184

185+
builder.where(
186+
"model_id = :model_id AND strategy_id = :strategy_id",
187+
model_id: id,
188+
strategy_id: @strategy.id,
189+
)
190+
191+
if exclude_category_ids.present?
192+
builder.join("topics t on t.id = topic_id")
193+
builder.where(<<~SQL, exclude_category_ids: exclude_category_ids.map(&:to_i))
194+
t.category_id NOT IN (:exclude_category_ids) AND
195+
t.category_id NOT IN (SELECT categories.id FROM categories WHERE categories.parent_category_id IN (:exclude_category_ids))
196+
SQL
197+
end
198+
199+
results = builder.query(query_embedding: raw_vector, limit: limit, offset: offset)
200+
179201
if return_distance
180202
results.map { |r| [r.topic_id, r.distance] }
181203
else
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# frozen_string_literal: true
2+
3+
RSpec.describe DiscourseAi::AiHelper::SemanticCategorizer do # Checks that category suggestions leave out categories the user has muted.
4+
fab!(:user)
5+
fab!(:muted_category) { Fabricate(:category) }
6+
fab!(:category_mute) do # Marks muted_category as muted for user.
7+
CategoryUser.create!(
8+
user: user,
9+
category: muted_category,
10+
notification_level: CategoryUser.notification_levels[:muted],
11+
)
12+
end
13+
fab!(:muted_topic) { Fabricate(:topic, category: muted_category) } # Topic inside the muted category.
14+
fab!(:category)
15+
fab!(:topic) { Fabricate(:topic, category: category) } # Topic inside a category the user has not muted.
16+
17+
let(:truncation) { DiscourseAi::Embeddings::Strategies::Truncation.new }
18+
let(:vector_rep) do
19+
DiscourseAi::Embeddings::VectorRepresentations::Base.current_representation(truncation)
20+
end
21+
let(:categorizer) { DiscourseAi::AiHelper::SemanticCategorizer.new({ text: "hello" }, user) } # Categorizer under test, built for the muting user.
22+
let(:expected_embedding) { [0.0038493] * vector_rep.dimensions } # Fixed vector with one entry per dimension of vector_rep.
23+
24+
before do
25+
SiteSetting.ai_embeddings_enabled = true
26+
SiteSetting.ai_embeddings_discourse_service_api_endpoint = "http://test.com"
27+
SiteSetting.ai_embeddings_model = "bge-large-en"
28+
29+
WebMock.stub_request(
30+
:post,
31+
"#{SiteSetting.ai_embeddings_discourse_service_api_endpoint}/api/v1/classify",
32+
).to_return(status: 200, body: JSON.dump(expected_embedding)) # Every POST to the classify endpoint answers with the same fixed vector.
33+
34+
vector_rep.generate_representation_from(topic) # Generate a representation for the topic in the non-muted category...
35+
vector_rep.generate_representation_from(muted_topic) # ...and for the topic in the muted category.
36+
end
37+
38+
it "respects user muted categories when making suggestions" do
39+
category_ids = categorizer.categories.map { |c| c[:id] } # Ids of the suggested categories.
40+
expect(category_ids).not_to include(muted_category.id) # The muted category must not be suggested.
41+
expect(category_ids).to include(category.id) # The non-muted category must still be suggested.
42+
end
43+
end

spec/lib/modules/embeddings/vector_representations/vector_rep_shared_examples.rb

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,5 +104,38 @@
104104
vector_rep.asymmetric_topics_similarity_search(similar_vector, limit: 1, offset: 0),
105105
).to contain_exactly(topic.id)
106106
end
107+
108+
it "can exclude categories" do
109+
similar_vector = [0.0038494] * vector_rep.dimensions
110+
text =
111+
truncation.prepare_text_from(
112+
topic,
113+
vector_rep.tokenizer,
114+
vector_rep.max_sequence_length - 2,
115+
)
116+
stub_vector_mapping(text, expected_embedding_1)
117+
vector_rep.generate_representation_from(topic)
118+
119+
expect(
120+
vector_rep.asymmetric_topics_similarity_search(
121+
similar_vector,
122+
limit: 1,
123+
offset: 0,
124+
exclude_category_ids: [topic.category_id],
125+
),
126+
).to be_empty
127+
128+
child_category = Fabricate(:category, parent_category_id: topic.category_id)
129+
topic.update!(category_id: child_category.id)
130+
131+
expect(
132+
vector_rep.asymmetric_topics_similarity_search(
133+
similar_vector,
134+
limit: 1,
135+
offset: 0,
136+
exclude_category_ids: [topic.category_id],
137+
),
138+
).to be_empty
139+
end
107140
end
108141
end

0 commit comments

Comments
 (0)