diff --git a/app/services/discourse_translator/discourse_ai.rb b/app/services/discourse_translator/discourse_ai.rb
index bad06938..947a3cb3 100644
--- a/app/services/discourse_translator/discourse_ai.rb
+++ b/app/services/discourse_translator/discourse_ai.rb
@@ -1,8 +1,5 @@
 # frozen_string_literal: true
 
-require_relative "base"
-require "json"
-
 module DiscourseTranslator
   class DiscourseAi < Base
     MAX_DETECT_LOCALE_TEXT_LENGTH = 1000
@@ -38,10 +35,12 @@ def self.translate!(translatable, target_locale_sym = I18n.locale)
       translated =
         case translatable.class.name
         when "Post"
-          ::DiscourseAi::PostTranslator.new(
-            text_for_translation(translatable, raw: true),
-            language,
-          ).translate
+          # Long posts are translated chunk by chunk and stitched back together.
+          text = text_for_translation(translatable, raw: true)
+          chunks = DiscourseTranslator::ContentSplitter.split(text)
+          chunks
+            .map { |chunk| ::DiscourseAi::PostTranslator.new(chunk, language).translate }
+            .join("")
         when "Topic"
           ::DiscourseAi::TopicTranslator.new(text_for_translation(translatable), language).translate
         end
diff --git a/lib/discourse_translator/content_splitter.rb b/lib/discourse_translator/content_splitter.rb
new file mode 100644
index 00000000..af45afb9
--- /dev/null
+++ b/lib/discourse_translator/content_splitter.rb
@@ -0,0 +1,133 @@
+# frozen_string_literal: true
+
+module DiscourseTranslator
+  # Splits long text into pieces of roughly CHUNK_SIZE characters, preferring
+  # to cut after a complete HTML node, a complete BBCode block, or a natural
+  # text boundary instead of in the middle of a word.
+  class ContentSplitter
+    CHUNK_SIZE = 3000
+
+    BBCODE_PATTERNS = [
+      %r{\[table.*?\].*?\[/table\]}m,
+      %r{\[quote.*?\].*?\[/quote\]}m,
+      %r{\[details.*?\].*?\[/details\]}m,
+      %r{\<details.*?\>.*?\</details\>}m,
+      %r{\[spoiler.*?\].*?\[/spoiler\]}m,
+      %r{\[code.*?\].*?\[/code\]}m,
+      /```.*?```/m,
+    ].freeze
+
+    TEXT_BOUNDARIES = [
+      /\n\s*\n\s*|\r\n\s*\r\n\s*/, # double newlines with optional spaces
+      /[.!?]\s+/, # sentence endings
+      /[,;]\s+/, # clause endings
+      /\n|\r\n/, # single newlines
+      /\s+/, # any whitespace
+    ].freeze
+
+    # Splits content into an ordered list of chunks.
+    #
+    # @param content [String, nil] text to split
+    # @return [Array<String>] [] for nil, [""] for an empty string, the whole
+    #   content when it fits in CHUNK_SIZE, otherwise its consecutive chunks
+    def self.split(content)
+      return [] if content.nil?
+      return [""] if content.empty?
+      return [content] if content.length <= CHUNK_SIZE
+
+      chunks = []
+      remaining = content.dup
+
+      while remaining.present?
+        chunk = extract_mixed_chunk(remaining)
+        if chunk.empty?
+          # No progress is possible; keep the rest rather than dropping it.
+          chunks << remaining
+          break
+        end
+        chunks << chunk
+        remaining = remaining[chunk.length..]
+      end
+
+      chunks
+    end
+
+    # Returns the leading part of text that should become the next chunk.
+    #
+    # @param text [String] remaining, not yet chunked text
+    # @param size [Integer] preferred chunk length
+    # @return [String] text itself when short enough, otherwise a prefix of it
+    def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
+      return text if text.length <= size
+      flexible_size = size * 1.5
+
+      # try each splitting strategy in order; the hard cut at size always
+      # qualifies, so a split point is always found
+      split_point =
+        [
+          -> { find_nearest_html_end_index(text, size) },
+          -> { find_nearest_bbcode_end_index(text, size) },
+          -> { find_text_boundary(text, size) },
+          -> { size },
+        ].lazy.map(&:call).compact.find { |pos| pos <= flexible_size }
+
+      text[0...split_point]
+    end
+
+    # Returns the end offset of the first top-level HTML node that finishes
+    # past target_pos, or nil when there is no markup or no such node.
+    # NOTE(review): offsets come from re-serialized nodes, which assumes
+    # to_html has the same length as the source markup - confirm.
+    def self.find_nearest_html_end_index(text, target_pos)
+      return nil unless text.include?("<")
+
+      current_length = 0
+      Nokogiri::HTML5.fragment(text).children.each do |node|
+        end_pos = current_length + node.to_html.length
+        return end_pos if end_pos > target_pos
+        current_length = end_pos
+      end
+      nil
+    rescue Nokogiri::SyntaxError
+      nil
+    end
+
+    # Returns the end offset of a BBCode/fenced block that straddles
+    # target_pos, or nil when target_pos is not inside any such block.
+    def self.find_nearest_bbcode_end_index(text, target_pos)
+      BBCODE_PATTERNS.each do |pattern|
+        text.scan(pattern) do
+          match = Regexp.last_match
+          return match.end(0) if match.begin(0) <= target_pos && match.end(0) > target_pos
+        end
+      end
+
+      nil
+    end
+
+    # Returns the offset just after the last text boundary that starts at or
+    # before target_pos, trying the strongest boundary kinds first; nil when
+    # the text has no boundary there.
+    def self.find_text_boundary(text, target_pos)
+      TEXT_BOUNDARIES.each do |pattern|
+        start = text.rindex(pattern, target_pos)
+        next if start.nil?
+
+        # Cut after the delimiter, not before it: a cut at the start of the
+        # match would push punctuation into the next chunk and could return 0,
+        # which yields an empty chunk.
+        pos = pattern.match(text, start).end(0)
+        # Include all trailing whitespace
+        pos += 1 while pos < text.length && text[pos].match?(/\s/)
+        return pos
+      end
+      nil
+    end
+
+    # def self. methods ignore a bare private, so hide the helpers explicitly.
+    private_class_method :extract_mixed_chunk,
+                         :find_nearest_html_end_index,
+                         :find_nearest_bbcode_end_index,
+                         :find_text_boundary
+  end
+end
diff --git a/spec/lib/content_splitter_spec.rb b/spec/lib/content_splitter_spec.rb
new file mode 100644
index 00000000..3d1d9f2f
--- /dev/null
+++ b/spec/lib/content_splitter_spec.rb
@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+
+require "rails_helper"
+
+describe DiscourseTranslator::ContentSplitter do
+  let(:original_limit) { 3000 }
+
+  after { described_class.const_set(:CHUNK_SIZE, original_limit) }
+
+  def set_limit(value)
+    described_class.const_set(:CHUNK_SIZE, value)
+  end
+
+  it "returns a single empty string for empty input" do
+    expect(described_class.split("")).to eq([""])
+  end
+
+  it "handles content with only spaces" do
+    expect(described_class.split(" ")).to eq([" "])
+    expect(described_class.split(" ")).to eq([" "])
+  end
+
+  it "handles nil input" do
+    expect(described_class.split(nil)).to eq([])
+  end
+
+  it "doesn't split content under limit" do
+    text = "hello world"
+    expect(described_class.split(text)).to eq([text])
+  end
+
+  it "preserves HTML tags" do
+    set_limit(10)
+    text = "

hello

meow

"
+    expect(described_class.split(text)).to eq(%w[

hello

meow

])
+
+    set_limit(35)
+    text = "
hello
jurassic

world

"
+    expect(described_class.split(text)).to eq(
+      ["
hello
jurassic
", "

world

"],
+    )
+  end
+
+  it "preserves BBCode tags" do
+    set_limit(20)
+    text = "[quote]hello[/quote][details]world[/details]"
+    expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
+  end
+
+  it "doesn't split in middle of words" do
+    set_limit(10)
+    text = "my kitty best in the world"
+    expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
+  end
+
+  it "keeps sentence punctuation with the preceding chunk" do
+    set_limit(15)
+    text = "Hello world. Next sentence here."
+    expect(described_class.split(text)).to eq(["Hello world. ", "Next sentence ", "here."])
+  end
+
+  it "handles nested tags properly" do
+    set_limit(25)
+    text = "
hello

cat

world

meow

"
+    expect(described_class.split(text)).to eq(%w[
hello

cat

world

meow

])
+  end
+
+  it "handles mixed HTML and BBCode" do
+    set_limit(15)
+    text = "
hello
[quote]world[/quote]

beautiful

"
+    expect(described_class.split(text)).to eq(
+      ["
hello
", "[quote]world[/quote]", "

beautiful

"],
+    )
+  end
+
+  it "preserves newlines in sensible places" do
+    set_limit(10)
+    text = "hello\nbeautiful\nworld\n"
+    expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
+  end
+
+  it "handles email content properly" do
+    set_limit(20)
+    text = "From: test@test.com\nTo: other@test.com\nSubject: Hello\n\nContent here"
+    expect(described_class.split(text)).to eq(
+      ["From: test@test.com\n", "To: other@test.com\n", "Subject: Hello\n\n", "Content here"],
+    )
+  end
+
+  it "keeps code blocks intact" do
+    set_limit(30)
+    text = "Text\n```\ncode block\nhere\n```\nmore text"
+    expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
+  end
+
+  context "with multiple details tags" do
+    it "splits correctly between details tags" do
+      set_limit(30)
+      text = "
first content
second content
"
+      expect(described_class.split(text)).to eq(
+        ["
first content
", "
second content
"],
+      )
+    end
+  end
+end
diff --git a/spec/services/discourse_ai_spec.rb b/spec/services/discourse_ai_spec.rb
index 973b22fe..5a8c14bb 100644
--- a/spec/services/discourse_ai_spec.rb
+++ b/spec/services/discourse_ai_spec.rb
@@ -60,6 +60,13 @@
         expect(translated_text).to eq "some translated text"
       end
     end
+
+    it "sends the content for splitting and the split content for translation" do
+      post.update(raw: "#{"a" * 3000} #{"b" * 3000}")
+      DiscourseAi::Completions::Llm.with_prepared_responses(
+        %w[lol wut].map { |content| translation_json(content) },
+      ) { expect(DiscourseTranslator::DiscourseAi.translate!(post)).to eq "

lolwut

" }
+    end
   end
 
   def locale_json(content)