
Commit e8f0633

DEV: Extend truncation to all summarizable content (#884)

1 parent e8eed71 · commit e8f0633

File tree: 5 files changed, +32 −25 lines

lib/summarization/fold_content.rb
lib/summarization/strategies/base.rb
lib/summarization/strategies/chat_messages.rb
lib/summarization/strategies/hot_topic_gists.rb
lib/summarization/strategies/topic_summary.rb

lib/summarization/fold_content.rb

Lines changed: 23 additions & 5 deletions
@@ -25,8 +25,10 @@ def initialize(llm, strategy, persist_summaries: true)
       def summarize(user, &on_partial_blk)
         base_summary = ""
         initial_pos = 0
-        folded_summary =
-          fold(content_to_summarize, base_summary, initial_pos, user, &on_partial_blk)
+
+        truncated_content = content_to_summarize.map { |cts| truncate(cts) }
+
+        folded_summary = fold(truncated_content, base_summary, initial_pos, user, &on_partial_blk)
 
         clean_summary =
           Nokogiri::HTML5.fragment(folded_summary).css("ai")&.first&.text || folded_summary
@@ -37,7 +39,7 @@ def summarize(user, &on_partial_blk)
             strategy.type,
             llm_model.name,
             clean_summary,
-            content_to_summarize.map { |c| c[:id] },
+            truncated_content.map { |c| c[:id] },
           )
         else
           AiSummary.new(summarized_text: clean_summary)
@@ -121,9 +123,9 @@ def fold(items, summary, cursor, user, &on_partial_blk)
         prompt =
           (
             if summary.blank?
-              strategy.first_summary_prompt(iteration_content, tokenizer)
+              strategy.first_summary_prompt(iteration_content)
             else
-              strategy.summary_extension_prompt(summary, iteration_content, tokenizer)
+              strategy.summary_extension_prompt(summary, iteration_content)
             end
           )
 
@@ -143,6 +145,22 @@ def available_tokens
 
         llm_model.max_prompt_tokens - reserved_tokens
       end
+
+      def truncate(item)
+        item_content = item[:text].to_s
+        split_1, split_2 =
+          [item_content[0, item_content.size / 2], item_content[(item_content.size / 2)..-1]]
+
+        truncation_length = 500
+        tokenizer = llm_model.tokenizer_class
+
+        item[:text] = [
+          tokenizer.truncate(split_1, truncation_length),
+          tokenizer.truncate(split_2.reverse, truncation_length).reverse,
+        ].join(" ")
+
+        item
+      end
     end
   end
 end
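
The new truncate step keeps the head of each item's first half and the tail of its second half, up to 500 tokens per side, so the middle of very long posts is dropped before the content reaches the prompt. A minimal standalone sketch of that behavior, assuming only that the tokenizer responds to truncate(text, length) as in the diff above (WordTokenizer is a hypothetical stand-in for llm_model.tokenizer_class):

# Hypothetical word-based tokenizer standing in for llm_model.tokenizer_class.
class WordTokenizer
  def self.truncate(text, max_tokens)
    text.split(" ").first(max_tokens).join(" ")
  end
end

# Mirrors FoldContent#truncate: keep the start of the first half and the end of
# the second half, up to `truncation_length` tokens each, dropping the middle.
def truncate_middle(item, tokenizer: WordTokenizer, truncation_length: 500)
  item_content = item[:text].to_s
  split_1 = item_content[0, item_content.size / 2]
  split_2 = item_content[(item_content.size / 2)..-1]

  item[:text] = [
    tokenizer.truncate(split_1, truncation_length),
    # Reversing before and after truncation keeps the *end* of the second half.
    tokenizer.truncate(split_2.reverse, truncation_length).reverse,
  ].join(" ")

  item
end

# Example: a 5,000-word post keeps only its opening and closing words.
long_post = { id: 1, poster: "alice", text: (1..5000).map { |n| "word#{n}" }.join(" ") }
truncate_middle(long_post)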

lib/summarization/strategies/base.rb

Lines changed: 2 additions & 2 deletions
@@ -34,12 +34,12 @@ def targets_data
        end
 
        # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM when extending an existing summary.
-        def summary_extension_prompt(_summary, _texts_to_summarize, _tokenizer)
+        def summary_extension_prompt(_summary, _texts_to_summarize)
          raise NotImplementedError
        end
 
        # @returns { DiscourseAi::Completions::Prompt } - Prompt passed to the LLM for summarizing a single chunk of content.
-        def first_summary_prompt(_input, _tokenizer)
+        def first_summary_prompt(_input)
          raise NotImplementedError
        end
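
With the tokenizer argument gone from the Base interface, concrete strategies only build prompts from the already truncated content. A hypothetical strategy written against the simplified signatures might look like the sketch below; MyTopicStrategy, its prompt wording, and the post attributes used in targets_data are illustrative assumptions, not part of this commit, while DiscourseAi::Completions::Prompt and the target accessor come from the surrounding code.

class MyTopicStrategy < DiscourseAi::Summarization::Strategies::Base
  def targets_data
    # Illustrative: expose each post as { id:, poster:, text: }, like the
    # built-in strategies. `target` is assumed to be a Topic here.
    target.posts.map { |p| { id: p.post_number, poster: p.user&.username, text: p.raw } }
  end

  def first_summary_prompt(contents)
    input = contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
    DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
      Summarize the following conversation in a short paragraph:
      #{input}
    TEXT
  end

  def summary_extension_prompt(summary, contents)
    input = contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join
    DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
      Extend this summary with the new content below.
      Existing summary: #{summary}
      New content: #{input}
    TEXT
  end
end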

lib/summarization/strategies/chat_messages.rb

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@ def targets_data
            .map { { id: _1, poster: _2, text: _3 } }
        end
 
-        def summary_extension_prompt(summary, contents, _tokenizer)
+        def summary_extension_prompt(summary, contents)
          input =
            contents
              .map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
@@ -63,7 +63,7 @@ def summary_extension_prompt(summary, contents, _tokenizer)
          prompt
        end
 
-        def first_summary_prompt(contents, _tokenizer)
+        def first_summary_prompt(contents)
          content_title = target.name
          input =
            contents.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }.join

lib/summarization/strategies/hot_topic_gists.rb

Lines changed: 3 additions & 14 deletions
@@ -57,7 +57,7 @@ def targets_data
          end
        end
 
-        def summary_extension_prompt(summary, contents, _tokenizer)
+        def summary_extension_prompt(summary, contents)
          statements =
            contents
              .to_a
@@ -98,22 +98,11 @@ def summary_extension_prompt(summary, contents, _tokenizer)
          prompt
        end
 
-        def first_summary_prompt(contents, tokenizer)
+        def first_summary_prompt(contents)
          content_title = target.title
          statements =
            contents.to_a.map { |item| "(#{item[:id]} #{item[:poster]} said: #{item[:text]} " }
 
-          op_statement = statements.shift.to_s
-          split_1, split_2 =
-            [op_statement[0, op_statement.size / 2], op_statement[(op_statement.size / 2)..-1]]
-
-          truncation_length = 500
-
-          op_statement = [
-            tokenizer.truncate(split_1, truncation_length),
-            tokenizer.truncate(split_2.reverse, truncation_length).reverse,
-          ].join(" ")
-
          prompt = DiscourseAi::Completions::Prompt.new(<<~TEXT.strip)
            You are an advanced summarization bot. Analyze a given conversation and produce a concise,
            single-sentence summary that conveys the main topic and current developments to someone with no prior context.
@@ -138,7 +127,7 @@ def first_summary_prompt(contents, tokenizer)
 
            The conversation began with the following statement:
 
-            #{op_statement}\n
+            #{statements.shift}\n
          TEXT
 
          if statements.present?

lib/summarization/strategies/topic_summary.rb

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@ def targets_data
          end
        end
 
-        def summary_extension_prompt(summary, contents, _tokenizer)
+        def summary_extension_prompt(summary, contents)
          resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
          content_title = target.title
          input =
@@ -70,7 +70,7 @@ def summary_extension_prompt(summary, contents, _tokenizer)
          prompt
        end
 
-        def first_summary_prompt(contents, _tokenizer)
+        def first_summary_prompt(contents)
          resource_path = "#{Discourse.base_path}/t/-/#{target.id}"
          content_title = target.title
          input =
