discourse · xfalcox · Mar 25, 2025 · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025
diff --git a/app/services/discourse_translator/discourse_ai.rb b/app/services/discourse_translator/discourse_ai.rb
@@ -1,8 +1,5 @@
 # frozen_string_literal: true
 
-require_relative "base"
-require "json"
-
 module DiscourseTranslator
   class DiscourseAi < Base
     MAX_DETECT_LOCALE_TEXT_LENGTH = 1000
@@ -38,10 +35,11 @@ def self.translate!(translatable, target_locale_sym = I18n.locale)
       translated =
         case translatable.class.name
         when "Post"
-          ::DiscourseAi::PostTranslator.new(
-            text_for_translation(translatable, raw: true),
-            language,
-          ).translate
+          text = text_for_translation(translatable, raw: true)
+          chunks = DiscourseTranslator::ContentSplitter.split(text)
+          chunks
+            .map { |chunk| ::DiscourseAi::PostTranslator.new(chunk, target_locale_sym).translate }
+            .join("")
         when "Topic"
           ::DiscourseAi::TopicTranslator.new(text_for_translation(translatable), language).translate
         end

diff --git a/lib/discourse_translator/content_splitter.rb b/lib/discourse_translator/content_splitter.rb
@@ -0,0 +1,107 @@
+# frozen_string_literal: true
+
+module DiscourseTranslator
+  # Breaks long content into consecutive chunks of roughly CHUNK_SIZE characters.
+  # A chunk preferably ends after a top-level HTML node, then after a BBCode or
+  # fenced-code block, then at a natural text boundary; a hard cut at CHUNK_SIZE
+  # is the last resort.
+  class ContentSplitter
+    # Target chunk length in characters. A chunk may reach 1.5x this length when
+    # that allows it to end on one of the preferred boundaries.
+    CHUNK_SIZE = 3000
+
+    # Block-level constructs whose closing tag is an acceptable place to cut.
+    BBCODE_PATTERNS = [
+      %r{\[table.*?\].*?\[/table\]}m,
+      %r{\[quote.*?\].*?\[/quote\]}m,
+      %r{\[details.*?\].*?\[/details\]}m,
+      %r{\<details.*?\>.*?\</details\>}m,
+      %r{\[spoiler.*?\].*?\[/spoiler\]}m,
+      %r{\[code.*?\].*?\[/code\]}m,
+      /```.*?```/m,
+    ].freeze
+
+    # Plain-text boundaries, most preferred first.
+    TEXT_BOUNDARIES = [
+      /\n\s*\n\s*|\r\n\s*\r\n\s*/, # double newlines with optional spaces
+      /[.!?]\s+/, # sentence endings
+      /[,;]\s+/, # clause endings
+      /\n|\r\n/, # single newlines
+      /\s+/, # any whitespace
+    ].freeze
+
+    # Splits +content+ into chunks.
+    #
+    # @param content [String, nil] text to split
+    # @return [Array<String>] +[]+ for nil, +[""]+ for an empty string, the
+    #   content as a single chunk when it fits within CHUNK_SIZE, otherwise the
+    #   consecutive chunks in order
+    def self.split(content)
+      return [] if content.nil?
+      return [""] if content.empty?
+      return [content] if content.length <= CHUNK_SIZE
+
+      chunks = []
+      remaining = content.dup
+
+      # Stops once only blank text is left, so a whitespace-only tail is not
+      # emitted as a chunk of its own.
+      while remaining.present?
+        chunk = extract_mixed_chunk(remaining)
+        break if chunk.empty? # defensive: never loop without consuming input
+        chunks << chunk
+        remaining = remaining[chunk.length..-1]
+      end
+
+      chunks
+    end
+
+    # Returns the leading chunk of +text+: all of it when it fits within +size+,
+    # otherwise the prefix ending at the best available split point.
+    def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
+      return text if text.length <= size
+      flexible_size = size * 1.5
+
+      # Strategies run lazily in priority order; the first position that stays
+      # within the flexible limit wins. The last strategy always qualifies and
+      # forces a hard cut at +size+.
+      split_point =
+        [
+          -> { find_nearest_html_end_index(text, size) },
+          -> { find_nearest_bbcode_end_index(text, size) },
+          -> { find_text_boundary(text, size) },
+          -> { size },
+        ].lazy.map(&:call).compact.find { |pos| pos <= flexible_size }
+
+      text[0...split_point]
+    end
+
+    # Returns the end offset of the first top-level HTML node that extends past
+    # +target_pos+, or nil when +text+ has no "<" or no node reaches that far.
+    #
+    # NOTE(review): offsets are accumulated from Nokogiri's re-serialized nodes,
+    # which assumes the serialized length equals the source length - confirm for
+    # input that Nokogiri normalizes (e.g. escaped entities, unclosed tags).
+    def self.find_nearest_html_end_index(text, target_pos)
+      return nil unless text.include?("<")
+
+      consumed = 0
+      Nokogiri::HTML5
+        .fragment(text)
+        .children
+        .each do |node|
+          consumed += node.to_html.length
+          return consumed if consumed > target_pos
+        end
+
+      nil
+    rescue Nokogiri::SyntaxError
+      nil
+    end
+
+    # Returns the end offset of the first BBCODE_PATTERNS match that starts at or
+    # before +target_pos+ and ends after it, or nil when there is none.
+    def self.find_nearest_bbcode_end_index(text, target_pos)
+      BBCODE_PATTERNS.each do |pattern|
+        text.scan(pattern) do
+          match = Regexp.last_match
+          return match.end(0) if match.begin(0) <= target_pos && match.end(0) > target_pos
+        end
+      end
+
+      nil
+    end
+
+    # Finds a cut position at the last natural text boundary that starts at or
+    # before +target_pos+, trying TEXT_BOUNDARIES in priority order.
+    #
+    # @return [Integer, nil] index just past the boundary and any whitespace
+    #   following it (always greater than zero), or nil when no pattern matches
+    def self.find_text_boundary(text, target_pos)
+      TEXT_BOUNDARIES.each do |pattern|
+        next unless text.rindex(pattern, target_pos)
+
+        # Cut after the matched boundary rather than at its start: sentence and
+        # clause punctuation stays with the text it terminates, and the result
+        # can never be 0 (which would yield an empty chunk and stop the split).
+        pos = Regexp.last_match.end(0)
+        pos += 1 while pos < text.length && text[pos].match?(/\s/)
+        return pos
+      end
+
+      nil
+    end
+
+    # A bare `private` does not apply to `def self.` methods, so the helpers are
+    # hidden explicitly.
+    private_class_method :extract_mixed_chunk,
+                         :find_nearest_html_end_index,
+                         :find_nearest_bbcode_end_index,
+                         :find_text_boundary
+  end
+end
diff --git a/spec/lib/content_splitter_spec.rb b/spec/lib/content_splitter_spec.rb
@@ -0,0 +1,99 @@
+# frozen_string_literal: true
+
+require "rails_helper"
+
+describe DiscourseTranslator::ContentSplitter do
+  # Overrides CHUNK_SIZE for the current example only. stub_const restores the
+  # real value when the example ends, so a modified limit cannot leak into other
+  # examples and no "already initialized constant" warning is emitted.
+  def set_limit(value)
+    stub_const("#{described_class}::CHUNK_SIZE", value)
+  end
+
+  it "returns a single empty chunk for empty input" do
+    expect(described_class.split("")).to eq([""])
+  end
+
+  it "handles content with only spaces" do
+    expect(described_class.split(" ")).to eq([" "])
+    expect(described_class.split("  ")).to eq(["  "])
+  end
+
+  it "handles nil input" do
+    expect(described_class.split(nil)).to eq([])
+  end
+
+  it "doesn't split content under limit" do
+    text = "hello world"
+    expect(described_class.split(text)).to eq([text])
+  end
+
+  it "preserves HTML tags" do
+    set_limit(10)
+    text = "<p>hello</p><p>meow</p>"
+    expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>])
+
+    set_limit(35)
+    text = "<div>hello</div> <div>jurassic</div> <p>world</p>"
+    expect(described_class.split(text)).to eq(
+      ["<div>hello</div> <div>jurassic</div>", " <p>world</p>"],
+    )
+  end
+
+  it "preserves BBCode tags" do
+    set_limit(20)
+    text = "[quote]hello[/quote][details]world[/details]"
+    expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
+  end
+
+  it "doesn't split in middle of words" do
+    set_limit(10)
+    text = "my kitty best in the world"
+    expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
+  end
+
+  it "handles nested tags properly" do
+    set_limit(25)
+    text = "<div>hello<p>cat</p>world</div><p>meow</p>"
+    expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>])
+  end
+
+  it "handles mixed HTML and BBCode" do
+    set_limit(15)
+    text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
+    expect(described_class.split(text)).to eq(
+      ["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"],
+    )
+  end
+
+  it "preserves newlines in sensible places" do
+    set_limit(10)
+    text = "hello\nbeautiful\nworld\n"
+    expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
+  end
+
+  it "handles email content properly" do
+    set_limit(20)
+    # Each header line must end at or before the limit so the newline is the split point.
+    text = "From: test@test.com\nTo: other@test.com\nSubject: Hello\n\nContent here"
+    expect(described_class.split(text)).to eq(
+      ["From: test@test.com\n", "To: other@test.com\n", "Subject: Hello\n\n", "Content here"],
+    )
+  end
+
+  it "keeps code blocks intact" do
+    set_limit(30)
+    text = "Text\n```\ncode block\nhere\n```\nmore text"
+    expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
+  end
+
+  context "with multiple details tags" do
+    it "splits correctly between details tags" do
+      set_limit(30)
+      text = "<details>first content</details><details>second content</details>"
+      expect(described_class.split(text)).to eq(
+        ["<details>first content</details>", "<details>second content</details>"],
+      )
+    end
+  end
+end
diff --git a/spec/services/discourse_ai_spec.rb b/spec/services/discourse_ai_spec.rb
@@ -60,6 +60,13 @@
         expect(translated_text).to eq "some translated text"
       end
     end
+
+    # The raw is longer than a single chunk, so one prepared response is expected
+    # to be consumed per chunk and the translations joined back together.
+    it "sends the content for splitting and the split content for translation" do
+      post.update(raw: "#{"a" * 3000} #{"b" * 3000}")
+      prepared_responses = %w[lol wut].map { |reply| translation_json(reply) }
+
+      DiscourseAi::Completions::Llm.with_prepared_responses(prepared_responses) do
+        expect(DiscourseTranslator::DiscourseAi.translate!(post)).to eq "<p>lolwut</p>"
+      end
+    end
   end
 
   def locale_json(content)