Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions app/jobs/scheduled/sentiment_backfill.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# frozen_string_literal: true

module Jobs
  # Scheduled job that classifies the sentiment of posts still missing results
  # from the configured sentiment models, a small batch at a time.
  class SentimentBackfill < ::Jobs::Scheduled
    every 5.minutes
    cluster_concurrency 1

    # Runs a single backfill pass. Does nothing when sentiment analysis is
    # disabled, the hourly budget is zero, or no classifier is configured.
    def execute(_args)
      return unless SiteSetting.ai_sentiment_enabled

      hourly_budget = SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour
      return if hourly_budget.zero?

      sentiment = DiscourseAi::Sentiment::PostClassification.new
      return unless sentiment.has_classifiers?

      # The job runs every 5 minutes (12 times an hour), so each run gets one
      # twelfth of the hourly budget, but never fewer than one post.
      #
      # This is only an approximation: the number of posts already classified
      # during the current hour is not tracked anywhere.
      posts_per_run = [hourly_budget / 12, 1].max

      backlog =
        DiscourseAi::Sentiment::PostClassification.backfill_query(
          max_age_days: SiteSetting.ai_sentiment_backfill_post_max_age_days,
        )

      sentiment.bulk_classify!(backlog.limit(posts_per_run))
    end
  end
end
9 changes: 9 additions & 0 deletions config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@ discourse_ai:
ai_sentiment_model_configs:
default: ""
json_schema: DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema
# Hourly post budget read by Jobs::SentimentBackfill. The job divides it by 12
# (one share per 5-minute run, at least one post) and returns early when it is 0.
ai_sentiment_backfill_maximum_posts_per_hour:
default: 250
min: 0
max: 10000
hidden: true
# Passed to PostClassification.backfill_query as max_age_days: only posts created
# after (current_date - this many days) are considered for backfill.
ai_sentiment_backfill_post_max_age_days:
default: 60
hidden: true


ai_openai_dall_e_3_url: "https://api.openai.com/v1/images/generations"
ai_openai_embeddings_url: "https://api.openai.com/v1/embeddings"
Expand Down
16 changes: 16 additions & 0 deletions lib/post_extensions.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# frozen_string_literal: true

module DiscourseAi
# Associations added to Post (the plugin prepends this module onto Post) so
# that classification results can be loaded directly from a post.
module PostExtensions
extend ActiveSupport::Concern

prepended do
# Every classification result whose polymorphic target is this post.
has_many :classification_results, as: :target

# Only the results with classification_type "sentiment"; the sentiment
# backfill query preloads this association and the classifier reads
# model_used from it to skip models that already produced a result.
has_many :sentiment_classifications,
-> { where(classification_type: "sentiment") },
class_name: "ClassificationResult",
as: :target
end
end
end
59 changes: 57 additions & 2 deletions lib/sentiment/post_classification.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,47 @@
module DiscourseAi
module Sentiment
class PostClassification
# Builds the relation of posts that still need sentiment classification.
#
# A post qualifies when it is a regular, non-deleted post in a non-deleted
# topic that is not a private message, and the set of models with a stored
# sentiment result (compared by lower-cased name) differs from the set of
# configured classifiers. Posts with no sentiment result at all qualify too.
#
# @param from_post_id [Integer, nil] when present, restricts the result to
#   posts whose id is greater than or equal to this value
# @param max_age_days [Integer, nil] when present, restricts the result to
#   posts created after `current_date - max_age_days`
# @return [ActiveRecord::Relation] the posts to classify; an empty relation
#   when no classifier is configured
def self.backfill_query(from_post_id: nil, max_age_days: nil)
  # De-duplicated so the list can equal the DISTINCT aggregate built in SQL
  # below even if the same model is configured twice.
  available_classifier_names =
    DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema
      .values
      .map { |mc| mc.model_name.downcase }
      .uniq
      .sort

  # With no classifier configured there is nothing to backfill. Without this
  # guard the HAVING clause below would match every unclassified post and
  # callers would iterate over all of them without classifying anything.
  return Post.none if available_classifier_names.empty?

  base_query =
    Post
      .includes(:sentiment_classifications)
      .joins("INNER JOIN topics ON topics.id = posts.topic_id")
      .where(post_type: Post.types[:regular])
      .where.not(topics: { archetype: Archetype.private_message })
      .where(posts: { deleted_at: nil })
      .where(topics: { deleted_at: nil })
      .joins(<<~SQL)
        LEFT JOIN classification_results crs
          ON crs.target_id = posts.id
          AND crs.target_type = 'Post'
          AND crs.classification_type = 'sentiment'
      SQL
      .group("posts.id")
      .having(<<~SQL, available_classifier_names)
        COUNT(crs.model_used) = 0
        OR array_agg(
            DISTINCT LOWER(crs.model_used) ORDER BY LOWER(crs.model_used)
          )::text[] IS DISTINCT FROM array[?]
      SQL

  base_query = base_query.where("posts.id >= ?", from_post_id.to_i) if from_post_id.present?

  if max_age_days.present?
    # The value is coerced with to_i before interpolation, so only an integer
    # literal can ever reach the SQL string.
    base_query =
      base_query.where(
        "posts.created_at > current_date - INTERVAL '#{max_age_days.to_i} DAY'",
      )
  end

  base_query
end

def bulk_classify!(relation)
http_pool_size = 100
pool =
Expand All @@ -13,6 +54,7 @@ def bulk_classify!(relation)
)

available_classifiers = classifiers
return if available_classifiers.blank?
base_url = Discourse.base_url

promised_classifications =
Expand All @@ -25,9 +67,13 @@ def bulk_classify!(relation)
.fulfilled_future({ target: record, text: text }, pool)
.then_on(pool) do |w_text|
results = Concurrent::Hash.new
already_classified = w_text[:target].sentiment_classifications.map(&:model_used)

classifiers_for_target =
available_classifiers.reject { |ac| already_classified.include?(ac.model_name) }

promised_target_results =
available_classifiers.map do |c|
classifiers_for_target.map do |c|
Concurrent::Promises.future_on(pool) do
results[c.model_name] = request_with(w_text[:text], c, base_url)
end
Expand All @@ -52,12 +98,17 @@ def bulk_classify!(relation)

def classify!(target)
return if target.blank?
return if classifiers.blank?

to_classify = prepare_text(target)
return if to_classify.blank?

already_classified = target.sentiment_classifications.map(&:model_used)
classifiers_for_target =
classifiers.reject { |ac| already_classified.include?(ac.model_name) }

results =
classifiers.reduce({}) do |memo, model|
classifiers_for_target.reduce({}) do |memo, model|
memo[model.model_name] = request_with(to_classify, model)
memo
end
Expand All @@ -69,6 +120,10 @@ def classifiers
DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values
end

# Tells whether at least one sentiment classifier is configured.
#
# @return [Boolean] false when `classifiers` is blank, true otherwise
def has_classifiers?
  !classifiers.blank?
end

private

def prepare_text(target)
Expand Down
1 change: 1 addition & 0 deletions lib/sentiment/sentiment_site_setting_json_schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def self.schema
end

# Parses the sentiment model definitions stored in the
# `ai_sentiment_model_configs` site setting.
#
# Returns an empty Array (not a Hash) when the setting is blank, so the
# result has the same type whether or not models are configured.
#
# @return [Array<OpenStruct>] one entry per configured model; assumes the
#   setting holds a JSON array of objects, as the job spec configures it
def self.values
  return [] if SiteSetting.ai_sentiment_model_configs.blank?
  JSON.parse(SiteSetting.ai_sentiment_model_configs, object_class: OpenStruct)
end
end
Expand Down
14 changes: 2 additions & 12 deletions lib/tasks/modules/sentiment/backfill.rake
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,8 @@

desc "Backfill sentiment for all posts"
task "ai:sentiment:backfill", [:start_post] => [:environment] do |_, args|
public_categories = Category.where(read_restricted: false).pluck(:id)

Post
.joins("INNER JOIN topics ON topics.id = posts.topic_id")
.joins(
"LEFT JOIN classification_results ON classification_results.target_id = posts.id AND classification_results.target_type = 'Post'",
)
.where("classification_results.target_id IS NULL")
.where("posts.id >= ?", args[:start_post].to_i || 0)
.where("category_id IN (?)", public_categories)
.where(posts: { deleted_at: nil })
.where(topics: { deleted_at: nil })
DiscourseAi::Sentiment::PostClassification
.backfill_query(from_post_id: args[:start_post].to_i)
.find_in_batches do |batch|
print "."
DiscourseAi::Sentiment::PostClassification.new.bulk_classify!(batch)
Expand Down
1 change: 1 addition & 0 deletions plugin.rb
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def self.public_asset_path(name)
# Prepend the plugin's extension modules onto the core classes they extend.
# DiscourseAi::PostExtensions adds the classification_results and
# sentiment_classifications associations to Post.
reloadable_patch do |plugin|
Guardian.prepend DiscourseAi::GuardianExtensions
Topic.prepend DiscourseAi::TopicExtensions
Post.prepend DiscourseAi::PostExtensions
end

register_modifier(:post_should_secure_uploads?) do |_, _, topic|
Expand Down
52 changes: 52 additions & 0 deletions spec/jobs/scheduled/sentiment_backfill_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# frozen_string_literal: true

require_relative "../../support/sentiment_inference_stubs"

RSpec.describe Jobs::SentimentBackfill do
  describe "#execute" do
    fab!(:post)

    # One classification result is expected per configured sentiment model.
    let(:configured_model_count) do
      DiscourseAi::Sentiment::SentimentSiteSettingJsonSchema.values.length
    end

    before do
      SiteSetting.ai_sentiment_enabled = true
      SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour = 100
      SiteSetting.ai_sentiment_model_configs =
        "[{\"model_name\":\"SamLowe/roberta-base-go_emotions\",\"endpoint\":\"http://samlowe-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"j-hartmann/emotion-english-distilroberta-base\",\"endpoint\":\"http://jhartmann-emotion.com\",\"api_key\":\"123\"},{\"model_name\":\"cardiffnlp/twitter-roberta-base-sentiment-latest\",\"endpoint\":\"http://cardiffnlp-sentiment.com\",\"api_key\":\"123\"}]"
    end

    # Counts the classification results stored for the given target.
    def classification_count_for(target)
      ClassificationResult.where(target: target).count
    end

    # Runs the job under test once with empty arguments.
    def run_backfill
      described_class.new.execute({})
    end

    it "backfills when settings are correct" do
      SentimentInferenceStubs.stub_classification(post)

      run_backfill

      expect(classification_count_for(post)).to eq(configured_model_count)
    end

    it "does nothing when batch size is zero" do
      SiteSetting.ai_sentiment_backfill_maximum_posts_per_hour = 0

      run_backfill

      expect(ClassificationResult.count).to eq(0)
    end

    it "does nothing when sentiment is disabled" do
      SiteSetting.ai_sentiment_enabled = false

      run_backfill

      expect(ClassificationResult.count).to eq(0)
    end

    it "respects the ai_sentiment_backfill_post_max_age_days setting" do
      SentimentInferenceStubs.stub_classification(post)
      SiteSetting.ai_sentiment_backfill_post_max_age_days = 80
      too_old_post = Fabricate(:post, created_at: 81.days.ago)

      run_backfill

      expect(classification_count_for(post)).to eq(configured_model_count)
      expect(classification_count_for(too_old_post)).to eq(0)
    end
  end
end
Loading
Loading