Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit ce6a2ec

Browse files
authored
FEATURE: Backfill posts sentiment. (#982)
* FEATURE: Backfill posts sentiment. It adds a scheduled job to backfill posts' sentiment, similar to our existing rake task, but with two settings to control the batch size and posts' max-age. * Make sure model_name order is consistent.
1 parent 7c65dd1 commit ce6a2ec

File tree

10 files changed

+315
-17
lines changed

10 files changed

+315
-17
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# frozen_string_literal: true
2+
3+
module Jobs
  # Scheduled job that hands a small batch of posts selected by
  # DiscourseAi::Sentiment::PostClassification.backfill_query to the
  # sentiment classifier on every run.
  class SentimentBackfill < ::Jobs::Scheduled
    every 5.minutes
    cluster_concurrency 1

    # Runs one backfill pass.
    #
    # Exits early when sentiment is disabled, when the hourly budget is zero,
    # or when no sentiment classifiers are configured.
    #
    # @param _args ignored
    def execute(_args)
      return unless SiteSetting.ai_sentiment_enabled

      hourly_budget = SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour
      return if hourly_budget.zero?

      # The job runs every 5 minutes, i.e. 12 times per hour, so the hourly
      # budget is divided into 12 slices. Budgets below 12 are rounded up to
      # 12 first so that every run processes at least one post.
      #
      # This is only an approximation: nothing tracks how many posts were
      # actually classified during the current hour.
      batch_size = [hourly_budget, 12].max / 12

      classifier = DiscourseAi::Sentiment::PostClassification.new
      return unless classifier.has_classifiers?

      max_age = SiteSetting.ai_sentiment_backfill_post_max_age_days
      pending_posts =
        DiscourseAi::Sentiment::PostClassification.backfill_query(max_age_days: max_age).limit(
          batch_size,
        )

      classifier.bulk_classify!(pending_posts)
    end
  end
end

config/settings.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,15 @@ discourse_ai:
1717
ai_sentiment_model_configs:
1818
default: ""
1919
json_schema: DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema
20+
ai_sentiment_backfill_maximum_posts_per_hour:
21+
default: 250
22+
min: 0
23+
max: 10000
24+
hidden: true
25+
ai_sentiment_backfill_post_max_age_days:
  default: 60
  # A negative age would make the backfill query compare posts.created_at
  # against a date in the future, so no post would ever match.
  min: 0
  hidden: true
28+
2029

2130
ai_openai_dall_e_3_url: "https://api.openai.com/v1/images/generations"
2231
ai_openai_embeddings_url: "https://api.openai.com/v1/embeddings"

lib/post_extensions.rb

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
  # Concern that declares classification-result associations on the class
  # it is prepended to.
  module PostExtensions
    extend ActiveSupport::Concern

    prepended do
      # Every classification result whose polymorphic target is this record.
      has_many :classification_results, as: :target

      # Same polymorphic association, restricted to rows whose
      # classification_type is "sentiment".
      has_many(
        :sentiment_classifications,
        -> { where(classification_type: "sentiment") },
        class_name: "ClassificationResult",
        as: :target,
      )
    end
  end
end

lib/sentiment/post_classification.rb

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,47 @@
33
module DiscourseAi
44
module Sentiment
55
class PostClassification
6+
# Builds the relation of posts that still need sentiment classification.
#
# A post is selected when it is a regular post, neither the post nor its
# topic is deleted, its topic is not a private message, and at least one of
# the currently configured sentiment models has no stored "sentiment"
# classification result for it.
#
# @param from_post_id [Integer, nil] when present, only posts whose id is
#   greater than or equal to this value are considered
# @param max_age_days [Integer, nil] when present, only posts created after
#   `current_date - max_age_days days` are considered
# @return [ActiveRecord::Relation] relation of posts grouped by posts.id
def self.backfill_query(from_post_id: nil, max_age_days: nil)
  configured_model_names =
    DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values.map do |mc|
      mc.model_name.downcase
    end

  base_query =
    Post
      .includes(:sentiment_classifications)
      .joins("INNER JOIN topics ON topics.id = posts.topic_id")
      .where(post_type: Post.types[:regular])
      .where.not(topics: { archetype: Archetype.private_message })
      .where(posts: { deleted_at: nil })
      .where(topics: { deleted_at: nil })
      .joins(<<~SQL)
        LEFT JOIN classification_results crs
          ON crs.target_id = posts.id
          AND crs.target_type = 'Post'
          AND crs.classification_type = 'sentiment'
      SQL
      .group("posts.id")
      .having(<<~SQL, configured_model_names)
        COUNT(crs.model_used) = 0
        OR NOT (array_agg(LOWER(crs.model_used))::text[] @> array[?]::text[])
      SQL

  # Why containment (@>) instead of array equality: a post only needs work
  # when a configured model is missing from its results. With an equality
  # check, results left behind by a model that is no longer configured would
  # make the post look "different" forever, so it would be re-selected on
  # every run even though there is nothing left to classify for it.

  base_query = base_query.where("posts.id >= ?", from_post_id.to_i) if from_post_id.present?

  if max_age_days.present?
    # The number of days is bound as a parameter rather than interpolated
    # into the SQL string.
    base_query =
      base_query.where(
        "posts.created_at > current_date - (? * INTERVAL '1 DAY')",
        max_age_days.to_i,
      )
  end

  base_query
end
46+
647
def bulk_classify!(relation)
748
http_pool_size = 100
849
pool =
@@ -13,6 +54,7 @@ def bulk_classify!(relation)
1354
)
1455

1556
available_classifiers = classifiers
57+
return if available_classifiers.blank?
1658
base_url = Discourse.base_url
1759

1860
promised_classifications =
@@ -25,9 +67,13 @@ def bulk_classify!(relation)
2567
.fulfilled_future({ target: record, text: text }, pool)
2668
.then_on(pool) do |w_text|
2769
results = Concurrent::Hash.new
70+
already_classified = w_text[:target].sentiment_classifications.map(&:model_used)
71+
72+
classifiers_for_target =
73+
available_classifiers.reject { |ac| already_classified.include?(ac.model_name) }
2874

2975
promised_target_results =
30-
available_classifiers.map do |c|
76+
classifiers_for_target.map do |c|
3177
Concurrent::Promises.future_on(pool) do
3278
results[c.model_name] = request_with(w_text[:text], c, base_url)
3379
end
@@ -52,12 +98,17 @@ def bulk_classify!(relation)
5298

5399
def classify!(target)
54100
return if target.blank?
101+
return if classifiers.blank?
55102

56103
to_classify = prepare_text(target)
57104
return if to_classify.blank?
58105

106+
already_classified = target.sentiment_classifications.map(&:model_used)
107+
classifiers_for_target =
108+
classifiers.reject { |ac| already_classified.include?(ac.model_name) }
109+
59110
results =
60-
classifiers.reduce({}) do |memo, model|
111+
classifiers_for_target.reduce({}) do |memo, model|
61112
memo[model.model_name] = request_with(to_classify, model)
62113
memo
63114
end
@@ -69,6 +120,10 @@ def classifiers
69120
DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values
70121
end
71122

123+
# Tells whether at least one sentiment classifier is configured.
#
# @return [Boolean] true when `classifiers` is not blank
def has_classifiers?
  configured = classifiers
  configured.present?
end
126+
72127
private
73128

74129
def prepare_text(target)

lib/sentiment/sentiment_site_setting_json_schema.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def self.schema
2727
end
2828

2929
# Parses the `ai_sentiment_model_configs` site setting.
#
# The setting holds a JSON array of model configurations; each entry is
# materialised as an OpenStruct (so `entry.model_name` works).
#
# @return [Array<OpenStruct>] the configured models, or an empty Array when
#   the setting is blank. An Array (not a Hash) is returned in the blank case
#   so callers always receive the same collection type.
def self.values
  return [] if SiteSetting.ai_sentiment_model_configs.blank?

  JSON.parse(SiteSetting.ai_sentiment_model_configs, object_class: OpenStruct)
end
3233
end

lib/tasks/modules/sentiment/backfill.rake

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,8 @@
22

33
desc "Backfill sentiment for all posts"
44
task "ai:sentiment:backfill", [:start_post] => [:environment] do |_, args|
5-
public_categories = Category.where(read_restricted: false).pluck(:id)
6-
7-
Post
8-
.joins("INNER JOIN topics ON topics.id = posts.topic_id")
9-
.joins(
10-
"LEFT JOIN classification_results ON classification_results.target_id = posts.id AND classification_results.target_type = 'Post'",
11-
)
12-
.where("classification_results.target_id IS NULL")
13-
.where("posts.id >= ?", args[:start_post].to_i || 0)
14-
.where("category_id IN (?)", public_categories)
15-
.where(posts: { deleted_at: nil })
16-
.where(topics: { deleted_at: nil })
5+
DiscourseAi::Sentiment::PostClassification
6+
.backfill_query(from_post_id: args[:start_post].to_i)
177
.find_in_batches do |batch|
188
print "."
199
DiscourseAi::Sentiment::PostClassification.new.bulk_classify!(batch)

plugin.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ def self.public_asset_path(name)
8989
reloadable_patch do |plugin|
9090
Guardian.prepend DiscourseAi::GuardianExtensions
9191
Topic.prepend DiscourseAi::TopicExtensions
92+
Post.prepend DiscourseAi::PostExtensions
9293
end
9394

9495
register_modifier(:post_should_secure_uploads?) do |_, _, topic|
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# frozen_string_literal: true
2+
3+
require_relative "../../support/sentiment_inference_stubs"
4+
5+
RSpec.describe Jobs::SentimentBackfill do
  describe "#execute" do
    fab!(:post)

    # Number of results expected per classified post: one per configured model.
    let(:expected_analysis) { DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values.length }

    before do
      SiteSetting.ai_sentiment_enabled = true
      SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour = 100
      SiteSetting.ai_sentiment_model_configs =
        "[{\"model_name\":\"SamLowe/roberta-base-go_emotions\",\"endpoint\":\"http://samlowe-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"j-hartmann/emotion-english-distilroberta-base\",\"endpoint\":\"http://jhartmann-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"cardiffnlp/twitter-roberta-base-sentiment-latest\",\"endpoint\":\"http://cardiffnlp-sentiment.com\",\"api_key\":\"123\"}]"
    end

    # Counts the classification results stored for the given target.
    def classification_count_for(target)
      ClassificationResult.where(target: target).count
    end

    it "backfills when settings are correct" do
      SentimentInferenceStubs.stub_classification(post)

      subject.execute({})

      expect(classification_count_for(post)).to eq(expected_analysis)
    end

    it "does nothing when batch size is zero" do
      SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour = 0

      subject.execute({})

      expect(ClassificationResult.count).to be_zero
    end

    it "does nothing when sentiment is disabled" do
      SiteSetting.ai_sentiment_enabled = false

      subject.execute({})

      expect(ClassificationResult.count).to be_zero
    end

    it "respects the ai_sentiment_backfill_post_max_age_days setting" do
      SentimentInferenceStubs.stub_classification(post)
      SiteSetting.ai_sentiment_backfill_post_max_age_days = 80
      # One day older than the configured maximum age, so it must be skipped.
      too_old_post = Fabricate(:post, created_at: 81.days.ago)

      subject.execute({})

      expect(classification_count_for(post)).to eq(expected_analysis)
      expect(classification_count_for(too_old_post)).to be_zero
    end
  end
end

0 commit comments

Comments
 (0)