Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit ad94cb7

Browse files
committed
Dedup concepts
1 parent 5fe78aa commit ad94cb7

File tree

8 files changed

+216
-68
lines changed

8 files changed

+216
-68
lines changed

app/jobs/scheduled/generate_concepts_from_popular_items.rb

Lines changed: 48 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -6,76 +6,80 @@ class GenerateConceptsFromPopularItems < ::Jobs::Scheduled
66

77
# This job runs daily and generates new concepts from popular topics and posts
88
# It selects items based on engagement metrics and generates concepts from their content
9-
def execute(args = {})
9+
def execute(_args)
1010
return unless SiteSetting.inferred_concepts_enabled
1111

1212
process_popular_topics
1313
process_popular_posts
1414
end
15-
15+
1616
private
17-
18-
def process_popular_topics
1917

18+
def process_popular_topics
2019
# Find candidate topics that are popular and don't have concepts yet
21-
candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_topics(
22-
limit: SiteSetting.inferred_concepts_daily_topics_limit || 20,
23-
min_posts: SiteSetting.inferred_concepts_min_posts || 5,
24-
min_likes: SiteSetting.inferred_concepts_min_likes || 10,
25-
min_views: SiteSetting.inferred_concepts_min_views || 100,
26-
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago
27-
)
20+
candidates =
21+
DiscourseAi::InferredConcepts::Manager.find_candidate_topics(
22+
limit: SiteSetting.inferred_concepts_daily_topics_limit || 20,
23+
min_posts: SiteSetting.inferred_concepts_min_posts || 5,
24+
min_likes: SiteSetting.inferred_concepts_min_likes || 10,
25+
min_views: SiteSetting.inferred_concepts_min_views || 100,
26+
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago,
27+
)
2828

2929
return if candidates.blank?
30-
30+
3131
# Process candidate topics - first generate concepts, then match
3232
Jobs.enqueue(
3333
:generate_inferred_concepts,
34-
item_type: 'topics',
35-
item_ids: candidates.map(&:id),
36-
batch_size: 10
37-
)
38-
39-
# Schedule a follow-up job to match existing concepts
40-
Jobs.enqueue_in(
41-
1.hour,
42-
:generate_inferred_concepts,
43-
item_type: 'topics',
34+
item_type: "topics",
4435
item_ids: candidates.map(&:id),
4536
batch_size: 10,
46-
match_only: true
4737
)
38+
39+
if SiteSetting.inferred_concepts_background_match
40+
# Schedule a follow-up job to match existing concepts
41+
Jobs.enqueue_in(
42+
1.hour,
43+
:generate_inferred_concepts,
44+
item_type: "topics",
45+
item_ids: candidates.map(&:id),
46+
batch_size: 10,
47+
match_only: true,
48+
)
49+
end
4850
end
49-
50-
def process_popular_posts
5151

52+
def process_popular_posts
5253
# Find candidate posts that are popular and don't have concepts yet
53-
candidates = DiscourseAi::InferredConcepts::Manager.find_candidate_posts(
54-
limit: SiteSetting.inferred_concepts_daily_posts_limit || 30,
55-
min_likes: SiteSetting.inferred_concepts_post_min_likes || 5,
56-
exclude_first_posts: true,
57-
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago
58-
)
54+
candidates =
55+
DiscourseAi::InferredConcepts::Manager.find_candidate_posts(
56+
limit: SiteSetting.inferred_concepts_daily_posts_limit || 30,
57+
min_likes: SiteSetting.inferred_concepts_post_min_likes || 5,
58+
exclude_first_posts: true,
59+
created_after: SiteSetting.inferred_concepts_lookback_days.days.ago,
60+
)
5961

6062
return if candidates.blank?
61-
63+
6264
# Process candidate posts - first generate concepts, then match
6365
Jobs.enqueue(
6466
:generate_inferred_concepts,
65-
item_type: 'posts',
66-
item_ids: candidates.map(&:id),
67-
batch_size: 10
68-
)
69-
70-
# Schedule a follow-up job to match against existing concepts
71-
Jobs.enqueue_in(
72-
1.hour,
73-
:generate_inferred_concepts,
74-
item_type: 'posts',
67+
item_type: "posts",
7568
item_ids: candidates.map(&:id),
7669
batch_size: 10,
77-
match_only: true
7870
)
71+
72+
if SiteSetting.inferred_concepts_background_match
73+
# Schedule a follow-up job to match against existing concepts
74+
Jobs.enqueue_in(
75+
1.hour,
76+
:generate_inferred_concepts,
77+
item_type: "posts",
78+
item_ids: candidates.map(&:id),
79+
batch_size: 10,
80+
match_only: true,
81+
)
82+
end
7983
end
8084
end
81-
end
85+
end

config/settings.yml

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -421,32 +421,39 @@ discourse_ai:
421421
inferred_concepts_enabled:
422422
default: false
423423
client: true
424-
description: "Enable the inferred concepts system that automatically generates and applies concepts to topics"
424+
inferred_concepts_background_match:
425+
default: false
426+
client: false
425427
inferred_concepts_daily_topics_limit:
426428
default: 20
427429
client: false
428-
description: "Maximum number of topics to process each day for concept generation"
429430
inferred_concepts_min_posts:
430431
default: 5
431432
client: false
432-
description: "Minimum number of posts a topic must have to be considered for concept generation"
433433
inferred_concepts_min_likes:
434434
default: 10
435435
client: false
436-
description: "Minimum number of likes a topic must have to be considered for concept generation"
437436
inferred_concepts_min_views:
438437
default: 100
439438
client: false
440-
description: "Minimum number of views a topic must have to be considered for concept generation"
441439
inferred_concepts_lookback_days:
442440
default: 30
443441
client: false
444-
description: "Only consider topics created within this many days for concept generation"
445442
inferred_concepts_daily_posts_limit:
446443
default: 30
447444
client: false
448-
description: "Maximum number of posts to process each day for concept generation"
449445
inferred_concepts_post_min_likes:
450446
default: 5
451447
client: false
452-
description: "Minimum number of likes a post must have to be considered for concept generation"
448+
inferred_concepts_generate_persona:
449+
default: "-15"
450+
type: enum
451+
enum: "DiscourseAi::Configuration::PersonaEnumerator"
452+
inferred_concepts_match_persona:
453+
default: "-16"
454+
type: enum
455+
enum: "DiscourseAi::Configuration::PersonaEnumerator"
456+
inferred_concepts_deduplicate_persona:
457+
default: "-17"
458+
type: enum
459+
enum: "DiscourseAi::Configuration::PersonaEnumerator"

db/migrate/20250508183456_create_topics_inferred_concepts.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,12 @@
22

33
class CreateTopicsInferredConcepts < ActiveRecord::Migration[7.0]
44
def change
5-
create_table :topics_inferred_concepts do |t|
5+
create_table :topics_inferred_concepts, primary_key: %i[topic_id inferred_concept_id] do |t|
66
t.integer :topic_id, null: false
77
t.integer :inferred_concept_id, null: false
88
t.timestamps
99
end
1010

11-
add_index :topics_inferred_concepts, [:topic_id, :inferred_concept_id], unique: true, name: 'idx_unique_topic_inferred_concept'
12-
add_index :topics_inferred_concepts, :topic_id
1311
add_index :topics_inferred_concepts, :inferred_concept_id
1412
end
15-
end
13+
end

db/migrate/20250509000001_create_posts_inferred_concepts.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,12 @@
22

33
class CreatePostsInferredConcepts < ActiveRecord::Migration[7.0]
44
def change
5-
create_table :posts_inferred_concepts do |t|
5+
create_table :posts_inferred_concepts, primary_key: %i[post_id inferred_concept_id] do |t|
66
t.integer :post_id, null: false
77
t.integer :inferred_concept_id, null: false
88
t.timestamps
99
end
1010

11-
add_index :posts_inferred_concepts, [:post_id, :inferred_concept_id], unique: true, name: 'idx_unique_post_inferred_concept'
12-
add_index :posts_inferred_concepts, :post_id
1311
add_index :posts_inferred_concepts, :inferred_concept_id
1412
end
15-
end
13+
end

lib/inferred_concepts/finder.rb

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,25 @@ def self.identify_concepts(content)
99
return [] if content.blank?
1010

1111
# Use the persona selected by the inferred_concepts_generate_persona site setting to identify concepts
12-
llm = DiscourseAi::Completions::Llm.default_llm
13-
persona = DiscourseAi::Personas::ConceptFinder.new
12+
persona =
13+
AiPersona
14+
.all_personas(enabled_only: false)
15+
.find { |persona| persona.id == SiteSetting.inferred_concepts_generate_persona.to_i }
16+
.new
17+
18+
llm = LlmModel.find(persona.class.default_llm_id)
1419
context =
1520
DiscourseAi::Personas::BotContext.new(
1621
messages: [{ type: :user, content: content }],
1722
user: Discourse.system_user,
1823
inferred_concepts: DiscourseAi::InferredConcepts::Manager.list_concepts,
1924
)
2025

21-
prompt = persona.craft_prompt(context)
22-
response = llm.completion(prompt, extract_json: true)
26+
bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm)
2327

24-
return [] unless response.success?
28+
response = bot.reply(context)
2529

26-
concepts = response.parsed_output["concepts"]
30+
concepts = JSON.parse(response[0][0]).dig("concepts")
2731
concepts || []
2832
end
2933

@@ -68,7 +72,7 @@ def self.find_candidate_topics(
6872
query = query.where("topics.created_at >= ?", created_after) if created_after.present?
6973

7074
# Exclude private-message (PM) topics by keeping only topics of the default archetype
71-
query = query.where(archetype: Topic.public_archetype)
75+
query = query.where(archetype: Archetype.default)
7276

7377
# Exclude topics that already have concepts
7478
topics_with_concepts = <<~SQL
@@ -134,6 +138,34 @@ def self.find_candidate_posts(
134138
# Return limited number of posts
135139
query.limit(limit)
136140
end
141+
142+
# Deduplicate and standardize a list of concept names using the persona selected by
# the inferred_concepts_deduplicate_persona site setting.
# @param concept_names [Array<String>] List of concept names to deduplicate
# @return [Array] The "streamlined_tags" entries from the persona's JSON reply; an empty
#   array when concept_names is blank or the reply has no "streamlined_tags" key
def self.deduplicate_concepts(concept_names)
  # Return an Array on every path so the result can be appended with Array#concat.
  return [] if concept_names.blank?

  persona_id = SiteSetting.inferred_concepts_deduplicate_persona.to_i
  persona =
    AiPersona.all_personas(enabled_only: false).find { |klass| klass.id == persona_id }.new

  llm = LlmModel.find(persona.class.default_llm_id)

  # The deduplicator receives the names as a single comma-separated user message
  input = { type: :user, content: concept_names.join(", ") }
  context =
    DiscourseAi::Personas::BotContext.new(messages: [input], user: Discourse.system_user)

  bot = DiscourseAi::Personas::Bot.as(Discourse.system_user, persona: persona, model: llm)
  response = bot.reply(context)

  # NOTE(review): assumes response[0][0] holds the raw JSON text of the reply — confirm
  JSON.parse(response[0][0]).dig("streamlined_tags") || []
end
137169
end
138170
end
139171
end

lib/inferred_concepts/manager.rb

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,65 @@ def self.list_concepts(limit: nil)
1414

1515
query.pluck(:name)
1616
end
17+
18+
# Deduplicate all stored concept names in batches grouped by first letter.
# This method will:
# 1. Group concept names by their upcased first character; names whose first
#    character does not match A-Z (or that are empty) go into a "#" group
# 2. Send each group through Finder.deduplicate_concepts in slices of per_letter_batch
# 3. Send the combined survivors through the deduplicator again in slices of full_pass_batch
# 4. Replace every InferredConcept row with the unique names from step 3
# @param per_letter_batch [Integer] Maximum names per deduplicator call in the per-letter pass
# @param full_pass_batch [Integer] Maximum names per deduplicator call in the final pass
# @return [Object, nil] The result of InferredConcept.insert_all, or nil when there are no
#   concepts or the deduplicator produced no names (the table is then left untouched)
def self.deduplicate_concepts_by_letter(per_letter_batch: 50, full_pass_batch: 150)
  all_concepts = list_concepts
  return if all_concepts.empty?

  letter_groups =
    all_concepts.group_by do |concept|
      first_char = concept[0]&.upcase
      first_char&.match?(/[A-Z]/) ? first_char : "#"
    end

  # Runs the deduplicator over fixed-size slices. Anything that is not an Array
  # (e.g. nil when the reply lacks the expected key) is treated as "no names"
  # rather than being allowed to blow up the whole run.
  dedupe_in_batches =
    lambda do |names, batch_size|
      names
        .each_slice(batch_size)
        .flat_map do |batch|
          result = Finder.deduplicate_concepts(batch)
          result.is_a?(Array) ? result : []
        end
    end

  letter_deduplicated_concepts =
    letter_groups.values.flat_map { |names| dedupe_in_batches.call(names, per_letter_batch) }
  return if letter_deduplicated_concepts.empty?

  # Final pass with all per-letter survivors
  final_result = dedupe_in_batches.call(letter_deduplicated_concepts, full_pass_batch).uniq

  # Never wipe the table unless there is something to put back.
  return if final_result.empty?

  # Swap the rows atomically so a failed insert cannot leave the table empty.
  # NOTE(review): rows in the topic/post join tables presumably still reference the
  # destroyed concept ids after this — verify how those associations are cleaned up.
  InferredConcept.transaction do
    InferredConcept.destroy_all
    InferredConcept.insert_all(final_result.map { |name| { name: name } })
  end
end
75+
1776
# Generate new concepts for a topic and apply them
1877
# @param topic [Topic] A Topic instance
1978
# @return [Array<InferredConcept>] The concepts that were applied
@@ -139,7 +198,7 @@ def self.match_content_to_concepts(content)
139198
# @option opts [DateTime] :created_after (30.days.ago) Only include topics created after this time
140199
# @return [Array<Topic>] Array of Topic objects that are good candidates
141200
def self.find_candidate_topics(opts = {})
142-
Finder.find_candidate_topics(opts)
201+
Finder.find_candidate_topics(**opts)
143202
end
144203

145204
# Find candidate posts that are good for concept generation

0 commit comments

Comments
 (0)