Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 9449a54

Browse files
committed
FEATURE: Backfill posts sentiment.
It adds a scheduled job to backfill posts' sentiment, similar to our existing rake task, but with two settings to control the batch size and posts' max-age.
1 parent 0cb2c41 commit 9449a54

File tree

10 files changed

+309
-20
lines changed

10 files changed

+309
-20
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# frozen_string_literal: true
2+
3+
module Jobs
  # Scheduled job that classifies the sentiment of a small batch of posts
  # selected by DiscourseAi::Sentiment::PostClassification.backfill_query.
  class SentimentBackfill < ::Jobs::Scheduled
    every 5.minutes
    cluster_concurrency 1

    # Classifies one batch of pending posts.
    #
    # Does nothing when sentiment analysis is disabled, when the hourly
    # budget is zero, or when no classifiers are available.
    def execute(_args)
      return unless SiteSetting.ai_sentiment_enabled

      hourly_budget = SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour
      return if hourly_budget.zero?

      # The job is scheduled every five minutes, so the hourly budget is split
      # into 12 slices, making sure each slice is at least one post.
      #
      # This is not exact, as there is no way of tracking how many posts were
      # classified in the current hour, but it is a good enough approximation.
      limit_per_job = [hourly_budget, 12].max / 12

      classificator = DiscourseAi::Sentiment::PostClassification.new
      return unless classificator.has_classifiers?

      max_age_days = SiteSetting.ai_sentiment_backfill_post_max_age_days
      pending_posts =
        DiscourseAi::Sentiment::PostClassification
          .backfill_query(max_age_days: max_age_days)
          .limit(limit_per_job)

      classificator.bulk_classify!(pending_posts)
    end
  end
end

config/settings.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,15 @@ discourse_ai:
6666
ai_sentiment_model_configs:
6767
default: ""
6868
json_schema: DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema
69+
ai_sentiment_backfill_maximum_posts_per_hour:
70+
default: 250
71+
min: 0
72+
max: 10000
73+
hidden: true
74+
ai_sentiment_backfill_post_max_age_days:
75+
default: 60
76+
hidden: true
77+
6978

7079
ai_nsfw_detection_enabled:
7180
default: false

lib/post_extensions.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
  # Concern that declares classification-result associations on the class it
  # is prepended to.
  module PostExtensions
    extend ActiveSupport::Concern

    prepended do
      # Every classification result whose polymorphic target is this record.
      has_many :classification_results, as: :target

      # Same association, narrowed to results of type "sentiment".
      sentiment_only = -> { where(classification_type: "sentiment") }

      has_many :sentiment_classifications,
               sentiment_only,
               as: :target,
               class_name: "ClassificationResult"
    end
  end
end

lib/sentiment/post_classification.rb

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,42 @@
33
module DiscourseAi
44
module Sentiment
55
class PostClassification
6+
# Builds the relation of posts that still need sentiment classification.
#
# A post is selected when it is a regular post, neither the post nor its
# topic is deleted, its topic is not a private message, and its stored
# sentiment results are either missing or do not match the list of
# configured classifier model names.
#
# @param from_post_id [Integer, nil] when present, keep only posts whose id
#   is greater than or equal to this value
# @param max_age_days [Integer, nil] when present, keep only posts created
#   after the database's current_date minus this many days
# @return the Post relation, grouped by posts.id, with sentiment
#   classifications eager-loaded
def self.backfill_query(from_post_id: nil, max_age_days: nil)
  configured_models = DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values
  available_classifier_names = configured_models.map(&:model_name).sort

  sentiment_results_join = <<~SQL
    LEFT JOIN classification_results crs
      ON crs.target_id = posts.id
      AND crs.target_type = 'Post'
      AND crs.classification_type = 'sentiment'
  SQL

  # NOTE(review): the expected names are sorted in Ruby while the aggregated
  # names are ordered by the database; presumably both orderings agree for the
  # configured model names - confirm, since a mismatch would keep fully
  # classified posts in the result.
  incomplete_results_condition = <<~SQL
    COUNT(crs.model_used) = 0
    OR array_agg(DISTINCT crs.model_used ORDER BY crs.model_used)::text[] IS DISTINCT FROM array[?]
  SQL

  query =
    Post
      .includes(:sentiment_classifications)
      .joins("INNER JOIN topics ON topics.id = posts.topic_id")
      .where(post_type: Post.types[:regular])
      .where.not(topics: { archetype: Archetype.private_message })
      .where(posts: { deleted_at: nil })
      .where(topics: { deleted_at: nil })
      .joins(sentiment_results_join)
      .group("posts.id")
      .having(incomplete_results_condition, available_classifier_names)

  query = query.where("posts.id >= ?", from_post_id.to_i) if from_post_id.present?

  return query if max_age_days.blank?

  # The value is coerced with to_i before being interpolated into the SQL.
  query.where("posts.created_at > current_date - INTERVAL '#{max_age_days.to_i} DAY'")
end
41+
642
def bulk_classify!(relation)
743
http_pool_size = 100
844
pool =
@@ -13,6 +49,7 @@ def bulk_classify!(relation)
1349
)
1450

1551
available_classifiers = classifiers
52+
return if available_classifiers.blank?
1653
base_url = Discourse.base_url
1754

1855
promised_classifications =
@@ -25,9 +62,13 @@ def bulk_classify!(relation)
2562
.fulfilled_future({ target: record, text: text }, pool)
2663
.then_on(pool) do |w_text|
2764
results = Concurrent::Hash.new
65+
already_classified = w_text[:target].sentiment_classifications.map(&:model_used)
66+
67+
classifiers_for_target =
68+
available_classifiers.reject { |ac| already_classified.include?(ac.model_name) }
2869

2970
promised_target_results =
30-
available_classifiers.map do |c|
71+
classifiers_for_target.map do |c|
3172
Concurrent::Promises.future_on(pool) do
3273
results[c.model_name] = request_with(w_text[:text], c, base_url)
3374
end
@@ -52,19 +93,28 @@ def bulk_classify!(relation)
5293

5394
# Classifies a single target with every available classifier that has not
# already produced a sentiment classification for it, then stores the results.
#
# Returns early when the target is blank, when there are no classifiers, or
# when there is no text to classify.
def classify!(target)
  return if target.blank? || classifiers.blank?

  text = prepare_text(target)
  return if text.blank?

  # Model names that already have a stored sentiment result for this target.
  done_model_names = target.sentiment_classifications.map(&:model_used)
  pending_classifiers =
    classifiers.reject { |classifier| done_model_names.include?(classifier.model_name) }

  results =
    pending_classifiers.each_with_object({}) do |classifier, by_model|
      by_model[classifier.model_name] = request_with(text, classifier)
    end

  store_classification(target, results)
end
67113

114+
# Tells whether at least one classifier is available.
#
# @return [Boolean] false when the classifiers list is blank
def has_classifiers?
  !classifiers.blank?
end
117+
68118
private
69119

70120
def prepare_text(target)

lib/sentiment/sentiment_site_setting_json_schema.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def self.schema
2727
end
2828

2929
# Returns the sentiment model configurations stored in the
# ai_sentiment_model_configs site setting.
#
# The setting is parsed as JSON, with every JSON object turned into an
# OpenStruct so its fields can be read as methods (e.g. model_name).
#
# @return [Array] the parsed configurations; an empty Array when the setting
#   is blank, so callers get an Array-like value either way (this path
#   previously returned an empty Hash)
def self.values
  raw_configs = SiteSetting.ai_sentiment_model_configs
  return [] if raw_configs.blank?

  JSON.parse(raw_configs, object_class: OpenStruct)
end
3233
end

lib/tasks/modules/sentiment/backfill.rake

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,8 @@
22

33
desc "Backfill sentiment for all posts"
44
task "ai:sentiment:backfill", [:start_post] => [:environment] do |_, args|
5-
public_categories = Category.where(read_restricted: false).pluck(:id)
6-
7-
Post
8-
.joins("INNER JOIN topics ON topics.id = posts.topic_id")
9-
.joins(
10-
"LEFT JOIN classification_results ON classification_results.target_id = posts.id AND classification_results.target_type = 'Post'",
11-
)
12-
.where("classification_results.target_id IS NULL")
13-
.where("posts.id >= ?", args[:start_post].to_i || 0)
14-
.where("category_id IN (?)", public_categories)
15-
.where(posts: { deleted_at: nil })
16-
.where(topics: { deleted_at: nil })
5+
DiscourseAi::Sentiment::PostClassification
6+
.backfill_query(from_post_id: args[:start_post].to_i)
177
.find_in_batches do |batch|
188
print "."
199
DiscourseAi::Sentiment::PostClassification.new.bulk_classify!(batch)

plugin.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def self.public_asset_path(name)
9191
reloadable_patch do |plugin|
9292
Guardian.prepend DiscourseAi::GuardianExtensions
9393
Topic.prepend DiscourseAi::TopicExtensions
94+
Post.prepend DiscourseAi::PostExtensions
9495
end
9596

9697
register_modifier(:post_should_secure_uploads?) do |_, _, topic|
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# frozen_string_literal: true
2+
3+
require_relative "../../support/sentiment_inference_stubs"
4+
5+
RSpec.describe Jobs::SentimentBackfill do
  describe "#execute" do
    fab!(:post)

    # One result is expected per configured sentiment model.
    let(:expected_analysis) { DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values.length }

    before do
      SiteSetting.ai_sentiment_enabled = true
      SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour = 100
      SiteSetting.ai_sentiment_model_configs =
        '[{"model_name":"SamLowe/roberta-base-go_emotions","endpoint":"http://samlowe-emotion.com","api_key":"123"},{"model_name":"j-hartmann/emotion-english-distilroberta-base","endpoint":"http://jhartmann-emotion.com","api_key":"123"},{"model_name":"cardiffnlp/twitter-roberta-base-sentiment-latest","endpoint":"http://cardiffnlp-sentiment.com","api_key":"123"}]'
    end

    # Number of classification results stored for the given record.
    def result_count_for(target)
      ClassificationResult.where(target: target).count
    end

    it "backfills when settings are correct" do
      SentimentInferenceStubs.stub_classification(post)

      subject.execute({})

      expect(result_count_for(post)).to eq(expected_analysis)
    end

    it "does nothing when batch size is zero" do
      SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour = 0

      subject.execute({})

      expect(ClassificationResult.count).to eq(0)
    end

    it "does nothing when sentiment is disabled" do
      SiteSetting.ai_sentiment_enabled = false

      subject.execute({})

      expect(ClassificationResult.count).to eq(0)
    end

    it "respects the ai_sentiment_backfill_post_max_age_days setting" do
      SentimentInferenceStubs.stub_classification(post)
      SiteSetting.ai_sentiment_backfill_post_max_age_days = 80
      # Created one day past the configured maximum age.
      too_old_post = Fabricate(:post, created_at: 81.days.ago)

      subject.execute({})

      expect(result_count_for(post)).to eq(expected_analysis)
      expect(result_count_for(too_old_post)).to eq(0)
    end
  end
end

0 commit comments

Comments
 (0)