
Commit e8f0633

DEV: Extend truncation to all summarizable content (#884)

1 parent e8eed71 · commit e8f0633

File tree: 5 files changed, +32 −25 lines

lib/summarization/fold_content.rb
lib/summarization/strategies/base.rb
lib/summarization/strategies/chat_messages.rb
lib/summarization/strategies/hot_topic_gists.rb
lib/summarization/strategies/topic_summary.rb

lib/summarization/fold_content.rb

Lines changed: 23 additions & 5 deletions
@@ -25,8 +25,10 @@ def initialize(llm, strategy, persist_summaries: true)
       def summarize(user, &on_partial_blk)
         base_summary = ""
         initial_pos = 0
-        folded_summary =
-          fold(content_to_summarize, base_summary, initial_pos, user, &on_partial_blk)
+
+        truncated_content = content_to_summarize.map { |cts| truncate(cts) }
+
+        folded_summary = fold(truncated_content, base_summary, initial_pos, user, &on_partial_blk)
 
         clean_summary =
           Nokogiri::HTML5.fragment(folded_summary).css("ai")&.first&.text || folded_summary
@@ -37,7 +39,7 @@ def summarize(user, &on_partial_blk)
             strategy.type,
             llm_model.name,
             clean_summary,
-            content_to_summarize.map { |c| c[:id] },
+            truncated_content.map { |c| c[:id] },
           )
         else
           AiSummary.new(summarized_text: clean_summary)
@@ -121,9 +123,9 @@ def fold(items, summary, cursor, user, &on_partial_blk)
         prompt =
           (
             if summary.blank?
-              strategy.first_summary_prompt(iteration_content, tokenizer)
+              strategy.first_summary_prompt(iteration_content)
             else
-              strategy.summary_extension_prompt(summary, iteration_content, tokenizer)
+              strategy.summary_extension_prompt(summary, iteration_content)
             end
           )
 
@@ -143,6 +145,22 @@ def available_tokens
 
         llm_model.max_prompt_tokens - reserved_tokens
       end
+
+      def truncate(item)
+        item_content = item[:text].to_s
+        split_1, split_2 =
+          [item_content[0, item_content.size / 2], item_content[(item_content.size / 2)..-1]]
+
+        truncation_length = 500
+        tokenizer = llm_model.tokenizer_class
+
+        item[:text] = [
+          tokenizer.truncate(split_1, truncation_length),
+          tokenizer.truncate(split_2.reverse, truncation_length).reverse,
+        ].join(" ")
+
+        item
+      end
     end
   end
 end
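
The new truncate step keeps the head of each item's first half and the tail of its second half, up to 500 tokens per side, so the middle of very long posts is dropped before the content reaches the prompt. A minimal standalone sketch of that behavior, assuming only that the tokenizer responds to truncate(text, length) as in the diff above (WordTokenizer is a hypothetical stand-in for llm_model.tokenizer_class):

# Hypothetical word-based tokenizer standing in for llm_model.tokenizer_class.
class WordTokenizer
  def self.truncate(text, max_tokens)
    text.split(" ").first(max_tokens).join(" ")
  end
end

# Mirrors FoldContent#truncate: keep the start of the first half and the end of
# the second half, up to `truncation_length` tokens each, dropping the middle.
def truncate_middle(item, tokenizer: WordTokenizer, truncation_length: 500)
  item_content = item[:text].to_s
  split_1 = item_content[0, item_content.size / 2]
  split_2 = item_content[(item_content.size / 2)..-1]

  item[:text] = [
    tokenizer.truncate(split_1, truncation_length),
    # Reversing before and after truncation keeps the *end* of the second half.
    tokenizer.truncate(split_2.reverse, truncation_length).reverse,
  ].join(" ")

  item
end

# Example: a 5,000-word post keeps only its opening and closing words.
long_post = { id: 1, poster: "alice", text: (1..5000).map { |n| "word#{n}" }.join(" ") }
truncate_middle(long_post)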

lib/summarization/strategies/base.rb

Lines changed: 2 additions & 2 deletions
@@ -34,12 +34,12 @@ def targets_data
        end
 
        # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when extending an existing summary.
-        def summary_extension_prompt(_summary, _texts_to_summarize, _tokenizer)
+        def summary_extension_prompt(_summary, _texts_to_summarize)
          raise NotImplementedError
        end
 
        # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM for summarizing a single chunk of content.
-        def first_summary_prompt(_input, _tokenizer)
+        def first_summary_prompt(_input)
          raise NotImplementedError
        end
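
With the tokenizer argument gone from the Base interface, concrete strategies only build prompts from the already truncated content. A hypothetical strategy written against the simplified signatures might look like the sketch below; MyTopicStrategy, its prompt wording, and the post attributes used in targets_data are illustrative assumptions, not part of this commit, while DiscourseAi::Completions::Prompt and the target accessor come from the surrounding code.

class MyTopicStrategy < DiscourseAi::Summarization::Strategies::Base
  def targets_data
    # Illustrative: expose each post as { id:, poster:, text: }, like the
    # built-in strategies. `target` is assumed to be a Topic here.
    target.posts.map { |p| { id: p.post_number, poster: p.user&.username, text: p.raw } }
  end

  def first_summary_prompt(contents)
    input = contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
    DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
      Summarize the following conversation in a short paragraph:
      #{input}
    TEXT
  end

  def summary_extension_prompt(summary, contents)
    input = contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
    DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
      Extend this summary with the new content below.
      Existing summary: #{summary}
      New content: #{input}
    TEXT
  end
end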

lib/summarization/strategies/chat_messages.rb

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@ def targets_data
            .map { { id: _1, poster: _2, text: _3 } }
        end
 
-        def summary_extension_prompt(summary, contents, _tokenizer)
+        def summary_extension_prompt(summary, contents)
          input =
            contents
              .map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
@@ -63,7 +63,7 @@ def summary_extension_prompt(summary, contents, _tokenizer)
          prompt
        end
 
-        def first_summary_prompt(contents, _tokenizer)
+        def first_summary_prompt(contents)
          content_title = target.name
          input =
            contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join

lib/summarization/strategies/hot_topic_gists.rb

Lines changed: 3 additions & 14 deletions
@@ -57,7 +57,7 @@ def targets_data
          end
        end
 
-        def summary_extension_prompt(summary, contents, _tokenizer)
+        def summary_extension_prompt(summary, contents)
          statements =
            contents
              .to_a
@@ -98,22 +98,11 @@ def summary_extension_prompt(summary, contents, _tokenizer)
          prompt
        end
 
-        def first_summary_prompt(contents, tokenizer)
+        def first_summary_prompt(contents)
          content_title = target.title
          statements =
            contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
 
-          op_statement = statements.shift.to_s
-          split_1, split_2 =
-            [op_statement[0, op_statement.size / 2], op_statement[(op_statement.size / 2)..-1]]
-
-          truncation_length = 500
-
-          op_statement = [
-            tokenizer.truncate(split_1, truncation_length),
-            tokenizer.truncate(split_2.reverse, truncation_length).reverse,
-          ].join(" ")
-
          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
            You are an advanced summarization bot. Analyze a given conversation and produce a concise,
            single-sentence summary that conveys the main topic and current developments to someone with no prior context.
@@ -138,7 +127,7 @@ def first_summary_prompt(contents, tokenizer)
 
            The conversation began with the following statement:
 
-            #{op_statement}\n
+            #{statements.shift}\n
          TEXT
 
          if statements.present?

lib/summarization/strategies/topic_summary.rb

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@ def targets_data
          end
        end
 
-        def summary_extension_prompt(summary, contents, _tokenizer)
+        def summary_extension_prompt(summary, contents)
          resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
          content_title = target.title
          input =
@@ -70,7 +70,7 @@ def summary_extension_prompt(summary, contents, _tokenizer)
          prompt
        end
 
-        def first_summary_prompt(contents, _tokenizer)
+        def first_summary_prompt(contents)
          resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
          content_title = target.title
          input =
