Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions app/services/discourse_translator/discourse_ai.rb
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
# frozen_string_literal: true

require_relative "base"
require "json"

module DiscourseTranslator
class DiscourseAi < Base
MAX_DETECT_LOCALE_TEXT_LENGTH = 1000
Expand Down Expand Up @@ -38,10 +35,11 @@ def self.translate!(translatable, target_locale_sym = I18n.locale)
translated =
case translatable.class.name
when "Post"
::DiscourseAi::PostTranslator.new(
text_for_translation(translatable, raw: true),
language,
).translate
text = text_for_translation(translatable, raw: true)
chunks = DiscourseTranslator::ContentSplitter.split(text)
chunks
.map { |chunk| ::DiscourseAi::PostTranslator.new(chunk, target_locale_sym).translate }
.join("")
when "Topic"
::DiscourseAi::TopicTranslator.new(text_for_translation(translatable), language).translate
end
Expand Down
107 changes: 107 additions & 0 deletions lib/discourse_translator/content_splitter.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# frozen_string_literal: true

module DiscourseTranslator
  # Splits long content into chunks of roughly CHUNK_SIZE characters.
  #
  # Split points are chosen, in order of preference, at the end of a top-level
  # HTML node, at the end of a BBCode/code block, or at a natural text boundary
  # (paragraph, sentence, clause, line, whitespace). A preferred split point may
  # overshoot the target by up to 50%; when none qualifies the content is cut at
  # exactly the target size.
  #
  # Splitting is lossless: `split(content).join == content`.
  class ContentSplitter
    # Target chunk length, in characters.
    CHUNK_SIZE = 3000

    # Blocks that should stay in one piece when they straddle the split target.
    BBCODE_PATTERNS = [
      %r{\[table.*?\].*?\[/table\]}m,
      %r{\[quote.*?\].*?\[/quote\]}m,
      %r{\[details.*?\].*?\[/details\]}m,
      %r{\<details.*?\>.*?\</details\>}m,
      %r{\[spoiler.*?\].*?\[/spoiler\]}m,
      %r{\[code.*?\].*?\[/code\]}m,
      /```.*?```/m,
    ].freeze

    # Text boundaries, from most to least preferred.
    TEXT_BOUNDARIES = [
      /\n\s*\n\s*|\r\n\s*\r\n\s*/, # double newlines with optional spaces
      /[.!?]\s+/, # sentence endings
      /[,;]\s+/, # clause endings
      /\n|\r\n/, # single newlines
      /\s+/, # any whitespace
    ].freeze

    # Splits +content+ into an ordered list of chunks.
    #
    # @param content [String, nil] the text to split
    # @return [Array<String>] +[]+ for nil, +[""]+ for an empty string, otherwise
    #   the chunks in order; concatenating them reproduces +content+ exactly
    def self.split(content)
      return [] if content.nil?
      return [""] if content.empty?
      return [content] if content.length <= CHUNK_SIZE

      chunks = []
      remaining = content

      until remaining.empty?
        chunk = extract_mixed_chunk(remaining)
        # Never discard content: if no progress can be made, emit the rest as is.
        chunk = remaining if chunk.empty?
        rest = remaining[chunk.length..-1]

        # A whitespace-only tail is folded into the last chunk so that it is
        # neither lost nor emitted as a chunk with nothing to translate.
        if rest.match?(/\A\s*\z/)
          chunks << "#{chunk}#{rest}"
          break
        end

        chunks << chunk
        remaining = rest
      end

      chunks
    end

    # Returns the leading chunk of +text+: the whole text when it fits in +size+,
    # otherwise the text up to the first split point offered by the strategies
    # below that does not exceed 1.5 * size, falling back to a cut at +size+.
    def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
      return text if text.length <= size
      flexible_size = size * 1.5

      # Strategies are evaluated one at a time, in priority order, and only
      # until one yields a usable position.
      strategies = [
        -> { find_nearest_html_end_index(text, size) },
        -> { find_nearest_bbcode_end_index(text, size) },
        -> { find_text_boundary(text, size) },
      ]

      split_point = size
      strategies.each do |strategy|
        candidate = strategy.call
        next if candidate.nil? || candidate <= 0 || candidate > flexible_size

        split_point = candidate
        break
      end

      text[0...split_point]
    end

    # Returns the cumulative length of the parsed top-level nodes up to and
    # including the first node that ends past +target_pos+, or nil when +text+
    # has no "<" or no node ends past the target.
    #
    # NOTE(review): lengths come from Nokogiri's serialized output, so this
    # presumably matches the source offsets only when the markup round-trips
    # unchanged -- confirm for unclosed tags and escaped entities.
    def self.find_nearest_html_end_index(text, target_pos)
      return nil unless text.include?("<")

      consumed = 0
      Nokogiri::HTML5.fragment(text).children.each do |node|
        consumed += node.to_html.length
        return consumed if consumed > target_pos
      end
      nil
    rescue Nokogiri::SyntaxError
      nil
    end

    # Returns the end index of the first BBCODE_PATTERNS block (checked in
    # pattern order) that starts at or before +target_pos+ and ends after it,
    # or nil when no block straddles the target.
    def self.find_nearest_bbcode_end_index(text, target_pos)
      BBCODE_PATTERNS.each do |pattern|
        text.scan(pattern) do
          match = Regexp.last_match
          # Matches arrive left to right; later ones cannot straddle the target.
          break if match.begin(0) > target_pos
          return match.end(0) if match.end(0) > target_pos
        end
      end

      nil
    end

    # Returns the index just past the last preferred text boundary that starts
    # at or before +target_pos+, or nil when +text+ has no boundary there.
    #
    # The boundary itself (punctuation and whitespace) stays with the chunk that
    # precedes it, so the returned index is always greater than zero.
    def self.find_text_boundary(text, target_pos)
      TEXT_BOUNDARIES.each do |pattern|
        next unless text.rindex(pattern, target_pos)

        pos = Regexp.last_match.end(0)
        # Include all trailing whitespace
        pos += 1 while pos < text.length && text[pos].match?(/\s/)
        return pos
      end
      nil
    end

    # A bare `private` does not apply to `def self.` methods, so the helpers are
    # hidden explicitly.
    private_class_method :extract_mixed_chunk,
                         :find_nearest_html_end_index,
                         :find_nearest_bbcode_end_index,
                         :find_text_boundary
  end
end
99 changes: 99 additions & 0 deletions spec/lib/content_splitter_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# frozen_string_literal: true

require "rails_helper"

describe DiscourseTranslator::ContentSplitter do
  # Overrides CHUNK_SIZE for the current example only. stub_const restores the
  # real constant when the example ends, so no other spec sees a changed limit.
  def set_limit(value)
    stub_const("#{described_class.name}::CHUNK_SIZE", value)
  end

  it "returns a single empty chunk for empty input" do
    expect(described_class.split("")).to eq([""])
  end

  it "handles content with only spaces" do
    expect(described_class.split(" ")).to eq([" "])
    expect(described_class.split("   ")).to eq(["   "])
  end

  it "handles nil input" do
    expect(described_class.split(nil)).to eq([])
  end

  it "doesn't split content under limit" do
    text = "hello world"
    expect(described_class.split(text)).to eq([text])
  end

  it "preserves HTML tags" do
    set_limit(10)
    text = "<p>hello</p><p>meow</p>"
    expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>])

    set_limit(35)
    text = "<div>hello</div> <div>jurassic</div> <p>world</p>"
    expect(described_class.split(text)).to eq(
      ["<div>hello</div> <div>jurassic</div>", " <p>world</p>"],
    )
  end

  it "preserves BBCode tags" do
    set_limit(20)
    text = "[quote]hello[/quote][details]world[/details]"
    expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
  end

  it "doesn't split in middle of words" do
    set_limit(10)
    text = "my kitty best in the world"
    expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
  end

  it "handles nested tags properly" do
    set_limit(25)
    text = "<div>hello<p>cat</p>world</div><p>meow</p>"
    expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>])
  end

  it "handles mixed HTML and BBCode" do
    set_limit(15)
    text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
    expect(described_class.split(text)).to eq(
      ["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"],
    )
  end

  it "preserves newlines in sensible places" do
    set_limit(10)
    text = "hello\nbeautiful\nworld\n"
    expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
  end

  it "handles email content properly" do
    set_limit(20)
    text = "From: test@test.com\nTo: user@test.com\nSubject: Hello\n\nContent here"
    expect(described_class.split(text)).to eq(
      ["From: test@test.com\n", "To: user@test.com\n", "Subject: Hello\n\n", "Content here"],
    )
  end

  it "keeps code blocks intact" do
    set_limit(30)
    text = "Text\n```\ncode block\nhere\n```\nmore text"
    expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
  end

  context "with multiple details tags" do
    it "splits correctly between details tags" do
      set_limit(30)
      text = "<details>first content</details><details>second content</details>"
      expect(described_class.split(text)).to eq(
        ["<details>first content</details>", "<details>second content</details>"],
      )
    end
  end
end
7 changes: 7 additions & 0 deletions spec/services/discourse_ai_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@
expect(translated_text).to eq "some translated text"
end
end

it "sends the content for splitting and the split content for translation" do
  # Two 3000-character runs separated by a single space: long enough that the
  # post is expected to be translated in two pieces, one prepared response each.
  long_raw = "#{"a" * 3000} #{"b" * 3000}"
  prepared_responses = %w[lol wut].map { |content| translation_json(content) }

  post.update(raw: long_raw)

  DiscourseAi::Completions::Llm.with_prepared_responses(prepared_responses) do
    translation = DiscourseTranslator::DiscourseAi.translate!(post)
    expect(translation).to eq "<p>lolwut</p>"
  end
end
end

def locale_json(content)
Expand Down
Loading