Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 27 additions & 13 deletions app/jobs/scheduled/summaries_backfill.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,34 +10,48 @@ def execute(_args)
return if !SiteSetting.ai_summarization_enabled
return if SiteSetting.ai_summary_backfill_maximum_topics_per_hour.zero?

# Split the budget into 12 intervals, but make sure it is at least one.
limit_per_job = [SiteSetting.ai_summary_backfill_maximum_topics_per_hour, 12].max / 12
budget = [current_budget, limit_per_job].min
system_user = Discourse.system_user

backfill_candidates
.limit(budget)
complete_t = AiSummary.summary_types[:complete]
backfill_candidates(complete_t)
.limit(current_budget(complete_t))
.each do |topic|
DiscourseAi::Summarization.topic_summary(topic).force_summarize(Discourse.system_user)
DiscourseAi::Summarization.topic_summary(topic).force_summarize(system_user)
end

gist_t = AiSummary.summary_types[:gist]
backfill_candidates(gist_t)
.limit(current_budget(gist_t))
.each { |topic| DiscourseAi::Summarization.topic_gist(topic).force_summarize(system_user) }
end

def backfill_candidates
def backfill_candidates(summary_type)
Topic
.where("topics.word_count >= ?", SiteSetting.ai_summary_backfill_minimum_word_count)
.joins(
"LEFT OUTER JOIN ai_summaries ais ON topics.id = ais.target_id AND ais.target_type = 'Topic'",
)
.joins(<<~SQL)
LEFT OUTER JOIN ai_summaries ais ON
topics.id = ais.target_id AND
ais.target_type = 'Topic' AND
ais.summary_type = '#{summary_type}'
SQL
.where(
"ais.id IS NULL OR UPPER(ais.content_range) < topics.highest_post_number + 1",
) # (1..1) gets stored as (1..2).
.order("ais.created_at DESC NULLS FIRST, topics.last_posted_at DESC")
end

def current_budget
def current_budget(type)
# Split the budget into 12 intervals, but make sure it is at least one.
base_budget = SiteSetting.ai_summary_backfill_maximum_topics_per_hour
used_budget = AiSummary.complete.system.where("created_at > ?", 1.hour.ago).count
limit_per_job = [base_budget, 12].max / 12

used_budget =
AiSummary.system.where("created_at > ?", 1.hour.ago).where(summary_type: type).count

current_budget = [(base_budget - used_budget), limit_per_job].min
return 0 if current_budget < 0

base_budget - used_budget
current_budget
end
end
end
4 changes: 2 additions & 2 deletions config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -376,9 +376,9 @@ discourse_ai:
type: list
list_type: compact
ai_summary_backfill_maximum_topics_per_hour:
default: 10
default: 0
min: 0
max: 1000
max: 10000
ai_summary_backfill_minimum_word_count:
default: 200
hidden: true
Expand Down
58 changes: 23 additions & 35 deletions spec/jobs/scheduled/summaries_backfill_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
RSpec.describe Jobs::SummariesBackfill do
fab!(:topic) { Fabricate(:topic, word_count: 200, highest_post_number: 2) }
let(:limit) { 24 } # guarantee two summaries per batch
let(:intervals) { 12 } # budget is split into intervals. Job runs every five minutes.

before do
assign_fake_provider_to(:ai_summarization_model)
Expand All @@ -11,65 +12,47 @@
end

describe "#current_budget" do
let(:type) { AiSummary.summary_types[:complete] }

context "when no summary has been backfilled yet" do
it "returns the full budget" do
expect(subject.current_budget).to eq(limit)
expect(subject.current_budget(type)).to eq(limit / intervals)
end

it "ignores summaries generated by users" do
Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:human])

expect(subject.current_budget).to eq(limit)
expect(subject.current_budget(type)).to eq(limit / intervals)
end

it "only accounts for complete type summaries" do
it "only accounts for summaries of the given type" do
Fabricate(:topic_ai_gist, target: topic, origin: AiSummary.origins[:human])

expect(subject.current_budget).to eq(limit)
end
end

context "when we already backfilled stuff" do
fab!(:backfilled_summary) do
Fabricate(:ai_summary, target: topic, origin: AiSummary.origins[:system])
end

context "if it was within the budget window" do
it "reduces our budget" do
expect(subject.current_budget).to eq(limit - 1)
end
end

context "if it wasn't within the budget window" do
before { freeze_time(2.hours.from_now) }

it "returns the full budget" do
freeze_time(2.hours.from_now)

expect(subject.current_budget).to eq(limit)
end
expect(subject.current_budget(type)).to eq(limit / intervals)
end
end
end

describe "#backfill_candidates" do
let(:type) { AiSummary.summary_types[:complete] }

it "only selects posts with enough words" do
topic.update!(word_count: 100)

expect(subject.backfill_candidates).to be_empty
expect(subject.backfill_candidates(type)).to be_empty
end

it "ignores up to date summaries" do
Fabricate(:ai_summary, target: topic, content_range: (1..2))

expect(subject.backfill_candidates).to be_empty
expect(subject.backfill_candidates(type)).to be_empty
end

it "orders candidates by topic#last_posted_at" do
topic.update!(last_posted_at: 1.minute.ago)
topic_2 = Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago)

expect(subject.backfill_candidates.map(&:id)).to contain_exactly(topic.id, topic_2.id)
expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic.id, topic_2.id)
end

it "prioritizes topics without summaries" do
Expand All @@ -78,7 +61,7 @@
topic.update!(last_posted_at: 1.minute.ago)
Fabricate(:ai_summary, target: topic, content_range: (1..1))

expect(subject.backfill_candidates.map(&:id)).to contain_exactly(topic_2.id, topic.id)
expect(subject.backfill_candidates(type).map(&:id)).to contain_exactly(topic_2.id, topic.id)
end
end

Expand All @@ -88,16 +71,21 @@
Fabricate(:topic, word_count: 200, last_posted_at: 2.minutes.ago, highest_post_number: 1)
topic.update!(last_posted_at: 1.minute.ago)
Fabricate(:ai_summary, target: topic, created_at: 3.hours.ago, content_range: (1..1))
Fabricate(:topic_ai_gist, target: topic, created_at: 3.hours.ago, content_range: (1..1))

summary_1 = "Summary of topic_2"
gist_1 = "Gist of topic_2"
summary_2 = "Summary of topic"
gist_2 = "Gist of topic"

DiscourseAi::Completions::Llm.with_prepared_responses([summary_1, summary_2]) do
subject.execute({})
end
DiscourseAi::Completions::Llm.with_prepared_responses(
[summary_1, summary_2, gist_1, gist_2],
) { subject.execute({}) }

expect(AiSummary.find_by(target: topic_2).summarized_text).to eq(summary_1)
expect(AiSummary.find_by(target: topic).summarized_text).to eq(summary_2)
expect(AiSummary.complete.find_by(target: topic_2).summarized_text).to eq(summary_1)
expect(AiSummary.gist.find_by(target: topic_2).summarized_text).to eq(gist_1)
expect(AiSummary.complete.find_by(target: topic).summarized_text).to eq(summary_2)
expect(AiSummary.gist.find_by(target: topic).summarized_text).to eq(gist_2)
end
end
end