Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 32ac63b

Browse files
committed
FEATURE: Backfill posts sentiment.
It adds a scheduled job to backfill posts' sentiment, similar to our existing rake task, but with two settings to control the batch size and posts' max-age.
1 parent 0cb2c41 commit 32ac63b

File tree

7 files changed

+220
-15
lines changed

7 files changed

+220
-15
lines changed
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# frozen_string_literal: true
2+
3+
module Jobs
  # Scheduled job that classifies the sentiment of posts which do not have
  # classification results yet, one small batch per run.
  class SentimentBackfill < ::Jobs::Scheduled
    every 5.minutes
    cluster_concurrency 1

    # Classifies a single batch of posts returned by
    # `PostClassification.backfill_query`.
    #
    # Returns early without doing anything when `ai_sentiment_enabled` is off
    # or when `ai_sentiment_backfill_maximum_posts_per_hour` is zero.
    def execute(_args)
      return unless SiteSetting.ai_sentiment_enabled

      hourly_budget = SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour
      return if hourly_budget.zero?

      classifier = DiscourseAi::Sentiment::PostClassification.new

      candidates =
        DiscourseAi::Sentiment::PostClassification.backfill_query(
          max_age_days: SiteSetting.ai_sentiment_backfill_post_max_age_days,
        ).limit(batch_size_for(hourly_budget))

      classifier.bulk_classify!(candidates)
    end

    private

    # Spreads the hourly budget over 12 runs, never going below one post per
    # run.
    #
    # This is only an approximation: the job does not track how many posts
    # were already classified during the current hour.
    def batch_size_for(hourly_budget)
      [hourly_budget, 12].max / 12
    end
  end
end

config/settings.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,15 @@ discourse_ai:
6666
ai_sentiment_model_configs:
6767
default: ""
6868
json_schema: DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema
69+
ai_sentiment_backfill_maximum_posts_per_hour:
70+
default: 0
71+
min: 0
72+
max: 10000
73+
hidden: true
74+
ai_sentiment_backfill_post_max_age_days:
75+
default: 60
76+
hidden: true
77+
6978

7079
ai_nsfw_detection_enabled:
7180
default: false

lib/sentiment/post_classification.rb

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,33 @@
33
module DiscourseAi
44
module Sentiment
55
class PostClassification
6+
# Builds the relation of posts that still need a sentiment classification.
#
# A post is selected when it is a regular post, is not deleted, belongs to a
# non-deleted topic in a category that is not read-restricted, and has no
# `classification_results` row targeting it.
#
# @param from_post_id [Integer, String, nil] when present, only posts whose
#   id is greater than or equal to this value are returned
# @param max_age_days [Integer, String, nil] when present, only posts created
#   after `current_date` minus this many days are returned
# @return [ActiveRecord::Relation] lazy relation over `Post`; callers may
#   chain `limit`, `find_in_batches`, etc.
def self.backfill_query(from_post_id: nil, max_age_days: nil)
  # Kept as a subquery (not plucked ids) so the category filter is evaluated
  # every time the relation runs, e.g. on each batch of a long backfill.
  public_category_ids = Category.where(read_restricted: false).select(:id)

  base_query =
    Post
      .joins("INNER JOIN topics ON topics.id = posts.topic_id")
      .joins(
        "LEFT JOIN classification_results ON classification_results.target_id = posts.id AND classification_results.target_type = 'Post'",
      )
      .where("classification_results.target_id IS NULL")
      # Hash conditions are table-qualified, so the column can never become
      # ambiguous among the joined tables.
      .where(topics: { category_id: public_category_ids })
      .where(posts: { post_type: Post.types[:regular] })
      .where(posts: { deleted_at: nil })
      .where(topics: { deleted_at: nil })

  base_query = base_query.where("posts.id >= ?", from_post_id.to_i) if from_post_id.present?

  if max_age_days.present?
    # `to_i` guarantees only an integer is interpolated into the SQL string.
    base_query =
      base_query.where(
        "posts.created_at > current_date - INTERVAL '#{max_age_days.to_i} DAY'",
      )
  end

  base_query
end
32+
633
def bulk_classify!(relation)
734
http_pool_size = 100
835
pool =
@@ -13,6 +40,7 @@ def bulk_classify!(relation)
1340
)
1441

1542
available_classifiers = classifiers
43+
return if available_classifiers.blank?
1644
base_url = Discourse.base_url
1745

1846
promised_classifications =
@@ -52,6 +80,7 @@ def bulk_classify!(relation)
5280

5381
def classify!(target)
5482
return if target.blank?
83+
return if classifiers.blank?
5584

5685
to_classify = prepare_text(target)
5786
return if to_classify.blank?
@@ -79,7 +108,7 @@ def prepare_text(target)
79108
end
80109

81110
def classifiers
82-
DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values
111+
@classifiers ||= DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values
83112
end
84113

85114
def request_with(content, config, base_url = Discourse.base_url)

lib/sentiment/sentiment_site_setting_json_schema.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def self.schema
2727
end
2828

2929
# Parses the `ai_sentiment_model_configs` site setting.
#
# @return [Array<OpenStruct>] the parsed model configs (the setting is
#   expected to hold a JSON array of objects); an empty array when the
#   setting is blank
def self.values
  raw_configs = SiteSetting.ai_sentiment_model_configs
  # Return an empty Array rather than a Hash so the "nothing configured"
  # case has the same type as the parsed JSON array.
  return [] if raw_configs.blank?

  JSON.parse(raw_configs, object_class: OpenStruct)
end
3233
end

lib/tasks/modules/sentiment/backfill.rake

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,8 @@
22

33
desc "Backfill sentiment for all posts"
44
task "ai:sentiment:backfill", [:start_post] => [:environment] do |_, args|
5-
public_categories = Category.where(read_restricted: false).pluck(:id)
6-
7-
Post
8-
.joins("INNER JOIN topics ON topics.id = posts.topic_id")
9-
.joins(
10-
"LEFT JOIN classification_results ON classification_results.target_id = posts.id AND classification_results.target_type = 'Post'",
11-
)
12-
.where("classification_results.target_id IS NULL")
13-
.where("posts.id >= ?", args[:start_post].to_i || 0)
14-
.where("category_id IN (?)", public_categories)
15-
.where(posts: { deleted_at: nil })
16-
.where(topics: { deleted_at: nil })
5+
DiscourseAi::Sentiment::PostClassification
6+
.backfill_query(from_post_id: args[:start_post].to_i)
177
.find_in_batches do |batch|
188
print "."
199
DiscourseAi::Sentiment::PostClassification.new.bulk_classify!(batch)
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# frozen_string_literal: true
2+
3+
require_relative "../../support/sentiment_inference_stubs"
4+
5+
RSpec.describe Jobs::SentimentBackfill do
  describe "#execute" do
    subject(:job) { described_class.new }

    fab!(:post)

    # One classification result is expected per configured model.
    let(:expected_analysis) { DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values.length }

    before do
      SiteSetting.ai_sentiment_enabled = true
      SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour = 100
      SiteSetting.ai_sentiment_model_configs =
        "[{\"model_name\":\"SamLowe/roberta-base-go_emotions\",\"endpoint\":\"http://samlowe-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"j-hartmann/emotion-english-distilroberta-base\",\"endpoint\":\"http://jhartmann-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"cardiffnlp/twitter-roberta-base-sentiment-latest\",\"endpoint\":\"http://cardiffnlp-sentiment.com\",\"api_key\":\"123\"}]"
    end

    it "backfills when settings are correct" do
      SentimentInferenceStubs.stub_classification(post)

      job.execute({})

      results_for_post = ClassificationResult.where(target: post)
      expect(results_for_post.count).to eq(expected_analysis)
    end

    it "does nothing when batch size is zero" do
      SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour = 0

      job.execute({})

      expect(ClassificationResult.count).to be_zero
    end

    it "does nothing when sentiment is disabled" do
      SiteSetting.ai_sentiment_enabled = false

      job.execute({})

      expect(ClassificationResult.count).to be_zero
    end

    it "respects the ai_sentiment_backfill_post_max_age_days setting" do
      SentimentInferenceStubs.stub_classification(post)
      SiteSetting.ai_sentiment_backfill_post_max_age_days = 80
      # One day older than the configured maximum age, so it must be skipped.
      stale_post = Fabricate(:post, created_at: 81.days.ago)

      job.execute({})

      expect(ClassificationResult.where(target: post).count).to eq(expected_analysis)
      expect(ClassificationResult.where(target: stale_post).count).to be_zero
    end
  end
end

spec/lib/modules/sentiment/post_classification_spec.rb

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
require_relative "../../../support/sentiment_inference_stubs"
44

55
RSpec.describe DiscourseAi::Sentiment::PostClassification do
6-
fab!(:post_1) { Fabricate(:post, post_number: 2) }
7-
86
before do
97
SiteSetting.ai_sentiment_enabled = true
108
SiteSetting.ai_sentiment_model_configs =
119
"[{\"model_name\":\"SamLowe/roberta-base-go_emotions\",\"endpoint\":\"http://samlowe-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"j-hartmann/emotion-english-distilroberta-base\",\"endpoint\":\"http://jhartmann-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"cardiffnlp/twitter-roberta-base-sentiment-latest\",\"endpoint\":\"http://cardiffnlp-sentiment.com\",\"api_key\":\"123\"}]"
1210
end
1311

1412
describe "#classify!" do
13+
fab!(:post_1) { Fabricate(:post, post_number: 2) }
14+
1515
it "does nothing if the post content is blank" do
1616
post_1.update_columns(raw: "")
1717

@@ -28,9 +28,18 @@
2828

2929
expect(ClassificationResult.where(target: post_1).count).to eq(expected_analysis)
3030
end
31+
32+
it "does nothing if there are no classification model" do
33+
SiteSetting.ai_sentiment_model_configs = ""
34+
35+
subject.classify!(post_1)
36+
37+
expect(ClassificationResult.where(target: post_1).count).to be_zero
38+
end
3139
end
3240

3341
describe "#classify_bulk!" do
42+
fab!(:post_1) { Fabricate(:post, post_number: 2) }
3443
fab!(:post_2) { Fabricate(:post, post_number: 2) }
3544

3645
it "classifies all given posts" do
@@ -43,5 +52,91 @@
4352
expect(ClassificationResult.where(target: post_1).count).to eq(expected_analysis)
4453
expect(ClassificationResult.where(target: post_2).count).to eq(expected_analysis)
4554
end
55+
56+
it "does nothing if there are no classification model" do
57+
SiteSetting.ai_sentiment_model_configs = ""
58+
59+
subject.bulk_classify!(Post.where(id: [post_1.id, post_2.id]))
60+
61+
expect(ClassificationResult.where(target: post_1).count).to be_zero
62+
expect(ClassificationResult.where(target: post_2).count).to be_zero
63+
end
64+
end
65+
66+
describe ".backfill_query" do
67+
it "excludes posts in personal messages" do
68+
Fabricate(:private_message_post)
69+
70+
posts = described_class.backfill_query
71+
72+
expect(posts).to be_empty
73+
end
74+
75+
it "excludes posts in restricted categories" do
76+
sec_cat = Fabricate(:category, read_restricted: true)
77+
topic = Fabricate(:topic, category: sec_cat)
78+
Fabricate(:post, topic: topic)
79+
80+
posts = described_class.backfill_query
81+
82+
expect(posts).to be_empty
83+
end
84+
85+
it "includes regular posts only" do
86+
Fabricate(:small_action)
87+
88+
posts = described_class.backfill_query
89+
90+
expect(posts).to be_empty
91+
end
92+
93+
it "excludes posts from deleted topics" do
94+
topic = Fabricate(:topic, deleted_at: 1.hour.ago)
95+
Fabricate(:post, topic: topic)
96+
97+
posts = described_class.backfill_query
98+
99+
expect(posts).to be_empty
100+
end
101+
102+
it "excludes deleted posts" do
103+
Fabricate(:post, deleted_at: 1.hour.ago)
104+
105+
posts = described_class.backfill_query
106+
107+
expect(posts).to be_empty
108+
end
109+
110+
context "with max_age_days" do
111+
fab!(:age_post) { Fabricate(:post, created_at: 3.days.ago) }
112+
113+
it "includes a post when is younger" do
114+
posts = described_class.backfill_query(max_age_days: 4)
115+
116+
expect(posts).to contain_exactly(age_post)
117+
end
118+
119+
it "excludes posts when it's older" do
120+
posts = described_class.backfill_query(max_age_days: 2)
121+
122+
expect(posts).to be_empty
123+
end
124+
end
125+
126+
context "with from_post_id" do
127+
fab!(:post)
128+
129+
it "includes post if ID is higher" do
130+
posts = described_class.backfill_query(from_post_id: post.id - 1)
131+
132+
expect(posts).to contain_exactly(post)
133+
end
134+
135+
it "excludes post if ID is lower" do
136+
posts = described_class.backfill_query(from_post_id: post.id + 1)
137+
138+
expect(posts).to be_empty
139+
end
140+
end
46141
end
47142
end

0 commit comments

Comments
 (0)