FIX: Split raw content to prevent job from timing out

nattsw · nattsw · commit 7efffc420703 · 2025-03-11T15:36:33.000+08:00
diff --git a/app/services/discourse_ai/translator.rb b/app/services/discourse_ai/translator.rb
@@ -4,13 +4,14 @@ module DiscourseAi
   class Translator
     PROMPT_TEMPLATE = <<~TEXT.freeze
       You are an expert translator specializing in converting Markdown content from any source language to target locale "%{target_language}". Your task is to:
-      1. Translate the content accurately while preserving all Markdown formatting elements
-      2. Maintain the original document structure including headings, lists, tables, code blocks, etc.
-      3. Preserve all links, images, and other media references without translation
-      4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
-      5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
-      6. For ambiguous terms or phrases, choose the most contextually appropriate translation
-      7. You are being consumed via an API, only EVER return the translated text, do not return any other information
+      1. Accurately translate text only
+      2. Preserve all Markdown formatting elements, including incomplete ones
+      3. Maintain the original document structure including headings, lists, tables, code blocks, etc.
+      4. Preserve all links, images, and other media references without translation
+      5. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
+      6. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
+      7. For ambiguous terms or phrases, choose the most contextually appropriate translation
+      8. You are being consumed via an API, only EVER return the translated text, do not return any other information
     TEXT
 
     def initialize(text, target_language)
diff --git a/app/services/discourse_translator/discourse_ai.rb b/app/services/discourse_translator/discourse_ai.rb
@@ -1,8 +1,5 @@
 # frozen_string_literal: true
 
-require_relative "base"
-require "json"
-
 module DiscourseTranslator
   class DiscourseAi < Base
     MAX_DETECT_LOCALE_TEXT_LENGTH = 1000
@@ -33,7 +30,12 @@ def self.translate!(translatable, target_locale_sym = I18n.locale)
                 ),
               )
       end
-      ::DiscourseAi::Translator.new(text_for_translation(translatable), target_locale_sym).translate
+
+      text = text_for_translation(translatable)
+      chunks = DiscourseTranslator::ContentSplitter.split(text)
+      chunks
+        .map { |chunk| ::DiscourseAi::Translator.new(chunk, target_locale_sym).translate }
+        .join("")
     end
 
     private
diff --git a/lib/discourse_translator/content_splitter.rb b/lib/discourse_translator/content_splitter.rb
@@ -0,0 +1,155 @@
+# frozen_string_literal: true
+
+module DiscourseTranslator
+  # Splits long text into chunks of roughly CHUNK_SIZE characters, preferring to
+  # cut at the end of an HTML node, a BBCode/fenced block or a natural text
+  # boundary instead of at an arbitrary character.
+  class ContentSplitter
+    # Target chunk length in characters. A chunk may grow to 1.5x this value
+    # when that lets it end on a structural or text boundary.
+    CHUNK_SIZE = 3000
+
+    # Constructs that are kept whole when one straddles the target split point.
+    BBCODE_PATTERNS = [
+      %r{\[table.*?\].*?\[/table\]}m,
+      %r{\[quote.*?\].*?\[/quote\]}m,
+      %r{\[details.*?\].*?\[/details\]}m,
+      %r{\<details.*?\>.*?\</details\>}m,
+      %r{\[spoiler.*?\].*?\[/spoiler\]}m,
+      %r{\[code.*?\].*?\[/code\]}m,
+      /```.*?```/m,
+    ].freeze
+
+    # Plain-text split points, tried in order of preference.
+    TEXT_BOUNDARIES = [
+      /\n\s*\n\s*|\r\n\s*\r\n\s*/, # double newlines with optional spaces
+      /[.!?]\s+/, # sentence endings
+      /[,;]\s+/, # clause endings
+      /\n|\r\n/, # single newlines
+      /\s+/, # any whitespace
+    ].freeze
+
+    # Splits content into ordered chunks.
+    #
+    # @param content [String, nil] text to split
+    # @return [Array<String>] [] for nil, [""] for an empty string, [content]
+    #   when it already fits in CHUNK_SIZE, otherwise consecutive leading slices
+    #   of the content in their original order
+    def self.split(content)
+      return [] if content.nil?
+      return [""] if content.empty?
+      return [content] if content.length <= CHUNK_SIZE
+
+      chunks = []
+      remaining = content.dup
+
+      while remaining.present?
+        chunk = extract_mixed_chunk(remaining)
+        # An empty chunk means no progress can be made; stop rather than loop forever.
+        break if chunk.empty?
+        chunks << chunk
+        remaining = remaining[chunk.length..-1]
+      end
+
+      chunks
+    end
+
+    # NOTE(review): a bare private does not apply to def self. methods, so the
+    # helpers below stay publicly callable; private_class_method would be
+    # needed to actually hide them.
+    private
+
+    # Returns the leading portion of text that should become the next chunk.
+    #
+    # Strategies run lazily in order (HTML node end, BBCode/fence end, text
+    # boundary, hard cut at size); the first position that is not nil and does
+    # not exceed 1.5 * size is used. The hard cut always qualifies.
+    #
+    # @param text [String] remaining content
+    # @param size [Integer] target chunk length
+    # @return [String] text itself when it fits, otherwise its prefix up to the split point
+    def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
+      return text if text.length <= size
+      flexible_size = size * 1.5
+
+      # try each splitting strategy in order
+      split_point =
+        [
+          -> { find_nearest_html_end_index(text, size) },
+          -> { find_nearest_bbcode_end_index(text, size) },
+          -> { find_text_boundary(text, size) },
+          -> { size },
+        ].lazy.map(&:call).compact.find { |pos| pos <= flexible_size }
+
+      text[0...split_point]
+    end
+
+    # Finds a split position at the end of a top-level HTML node.
+    #
+    # Parses text as an HTML fragment and accumulates the serialized length of
+    # each top-level node, returning the running total at the first node that
+    # ends past target_pos.
+    # NOTE(review): offsets come from Nokogiri's serialized output; presumably
+    # they can differ from offsets in the raw text if parsing normalizes the
+    # markup - confirm.
+    #
+    # @return [Integer, nil] nil when text has no "<", no node ends past
+    #   target_pos, or Nokogiri raises a syntax error
+    def self.find_nearest_html_end_index(text, target_pos)
+      return nil unless text.include?("<")
+
+      begin
+        doc = Nokogiri::HTML.fragment(text)
+        current_length = 0
+
+        doc.children.each do |node|
+          html = node.to_html
+          end_pos = current_length + html.length
+          return end_pos if end_pos > target_pos
+          current_length = end_pos
+        end
+        nil
+      rescue Nokogiri::SyntaxError
+        nil
+      end
+    end
+
+    # Finds the end of a BBCode/<details>/fenced block that straddles target_pos.
+    #
+    # @return [Integer, nil] end offset of the first BBCODE_PATTERNS match that
+    #   starts at or before target_pos and ends after it, or nil if there is none
+    def self.find_nearest_bbcode_end_index(text, target_pos)
+      BBCODE_PATTERNS.each do |pattern|
+        text.scan(pattern) do |_|
+          match = $~
+          tag_start = match.begin(0)
+          tag_end = match.end(0)
+
+          return tag_end if tag_start <= target_pos && tag_end > target_pos
+        end
+      end
+
+      nil
+    end
+
+    # Finds a natural text boundary at or before target_pos and returns the
+    # position just past it (including any whitespace that follows).
+    #
+    # @return [Integer, nil] offset to split at (always greater than 0), or nil
+    #   when no TEXT_BOUNDARIES pattern matches at or before target_pos
+    def self.find_text_boundary(text, target_pos)
+      TEXT_BOUNDARIES.each do |pattern|
+        next unless text.rindex(pattern, target_pos)
+
+        # Split after the matched boundary rather than at its start. Using the
+        # start left punctuation such as ". " at the head of the next chunk and
+        # could return 0, which made the caller stop and drop the remaining text.
+        pos = Regexp.last_match.end(0)
+        # Include all trailing whitespace
+        pos += 1 while pos < text.length && text[pos].match?(/\s/)
+        return pos
+      end
+      nil
+    end
+  end
+end
diff --git a/spec/lib/content_splitter_spec.rb b/spec/lib/content_splitter_spec.rb
@@ -0,0 +1,96 @@
+# frozen_string_literal: true
+
+require "rails_helper"
+
+describe DiscourseTranslator::ContentSplitter do
+  # Captured eagerly (let!) before any example changes the constant, so the
+  # after hook restores the real production value instead of a hard-coded copy.
+  let!(:original_limit) { described_class::CHUNK_SIZE }
+
+  after { described_class.const_set(:CHUNK_SIZE, original_limit) }
+
+  def set_limit(value)
+    described_class.const_set(:CHUNK_SIZE, value)
+  end
+
+  it "returns a single empty chunk for empty input" do
+    expect(described_class.split("")).to eq([""])
+  end
+
+  it "handles nil input" do
+    expect(described_class.split(nil)).to eq([])
+  end
+
+  it "doesn't split content under limit" do
+    text = "hello world"
+    expect(described_class.split(text)).to eq([text])
+  end
+
+  it "preserves HTML tags" do
+    set_limit(10)
+    text = "<p>hello</p><p>meow</p>"
+    expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>])
+
+    set_limit(35)
+    text = "<div>hello</div> <div>jurassic</div> <p>world</p>"
+    expect(described_class.split(text)).to eq(
+      ["<div>hello</div> <div>jurassic</div>", " <p>world</p>"],
+    )
+  end
+
+  it "preserves BBCode tags" do
+    set_limit(20)
+    text = "[quote]hello[/quote][details]world[/details]"
+    expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
+  end
+
+  it "doesn't split in middle of words" do
+    set_limit(10)
+    text = "my kitty best in the world"
+    expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
+  end
+
+  it "handles nested tags properly" do
+    set_limit(25)
+    text = "<div>hello<p>cat</p>world</div><p>meow</p>"
+    expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>])
+  end
+
+  it "handles mixed HTML and BBCode" do
+    set_limit(15)
+    text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
+    expect(described_class.split(text)).to eq(
+      ["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"],
+    )
+  end
+
+  it "preserves newlines in sensible places" do
+    set_limit(10)
+    text = "hello\nbeautiful\nworld\n"
+    expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
+  end
+
+  it "handles email content properly" do
+    set_limit(20)
+    text = "From: test@test.com\nTo: other@test.com\nSubject: Hello\n\nContent here"
+    expect(described_class.split(text)).to eq(
+      ["From: test@test.com\n", "To: other@test.com\n", "Subject: Hello\n\n", "Content here"],
+    )
+  end
+
+  it "keeps code blocks intact" do
+    set_limit(30)
+    text = "Text\n```\ncode block\nhere\n```\nmore text"
+    expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
+  end
+
+  context "with multiple details tags" do
+    it "splits correctly between details tags" do
+      set_limit(30)
+      text = "<details>first content</details><details>second content</details>"
+      expect(described_class.split(text)).to eq(
+        ["<details>first content</details>", "<details>second content</details>"],
+      )
+    end
+  end
+end
diff --git a/spec/services/discourse_ai/translator_spec.rb b/spec/services/discourse_ai/translator_spec.rb
@@ -18,13 +18,14 @@
       allow(DiscourseAi::Completions::Prompt).to receive(:new).with(
         <<~TEXT,
       You are an expert translator specializing in converting Markdown content from any source language to target locale "de". Your task is to:
-      1. Translate the content accurately while preserving all Markdown formatting elements
-      2. Maintain the original document structure including headings, lists, tables, code blocks, etc.
-      3. Preserve all links, images, and other media references without translation
-      4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
-      5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
-      6. For ambiguous terms or phrases, choose the most contextually appropriate translation
-      7. You are being consumed via an API, only EVER return the translated text, do not return any other information
+      1. Accurately translate text only
+      2. Preserve all Markdown formatting elements, including incomplete ones
+      3. Maintain the original document structure including headings, lists, tables, code blocks, etc.
+      4. Preserve all links, images, and other media references without translation
+      5. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
+      6. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
+      7. For ambiguous terms or phrases, choose the most contextually appropriate translation
+      8. You are being consumed via an API, only EVER return the translated text, do not return any other information
         TEXT
         messages: [{ type: :user, content: "cats are great", id: "user" }],
       ).and_call_original
diff --git a/spec/services/discourse_ai_spec.rb b/spec/services/discourse_ai_spec.rb
@@ -35,14 +35,19 @@
     end
   end
 
-  describe ".translate" do
+  describe ".translate!" do
     before { post.set_detected_locale("de") }
 
-    it "translates the post and returns [locale, translated_text]" do
+    it "returns the translated text from the llm" do
       DiscourseAi::Completions::Llm.with_prepared_responses(["some translated text"]) do
-        locale, translated_text = DiscourseTranslator::DiscourseAi.translate(post)
-        expect(locale).to eq "de"
-        expect(translated_text).to eq "some translated text"
+        expect(DiscourseTranslator::DiscourseAi.translate!(post)).to eq "some translated text"
+      end
+    end
+
+    it "sends the content for splitting and the split content for translation" do
+      post.update(raw: "#{"a" * 3000} #{"b" * 3000}")
+      DiscourseAi::Completions::Llm.with_prepared_responses(%w[lol wut]) do
+        expect(DiscourseTranslator::DiscourseAi.translate!(post)).to eq "lolwut"
       end
     end
   end