Skip to content

Commit 23b83fb

Browse files
nattsw and xfalcox authored
FIX: Split content for translation before sending (#249)
Co-authored-by: Rafael Silva <[email protected]>
1 parent 85eb33c commit 23b83fb

File tree

4 files changed

+218
-7
lines changed

4 files changed

+218
-7
lines changed

app/services/discourse_translator/discourse_ai.rb

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
# frozen_string_literal: true
22

3-
require_relative "base"
4-
require "json"
5-
63
module DiscourseTranslator
74
class DiscourseAi < Base
85
MAX_DETECT_LOCALE_TEXT_LENGTH = 1000
@@ -38,10 +35,11 @@ def self.translate!(translatable, target_locale_sym = I18n.locale)
3835
translated =
3936
case translatable.class.name
4037
when "Post"
41-
::DiscourseAi::PostTranslator.new(
42-
text_for_translation(translatable, raw: true),
43-
language,
44-
).translate
38+
text = text_for_translation(translatable, raw: true)
39+
chunks = DiscourseTranslator::ContentSplitter.split(text)
40+
chunks
41+
.map { |chunk| ::DiscourseAi::PostTranslator.new(chunk, target_locale_sym).translate }
42+
.join("")
4543
when "Topic"
4644
::DiscourseAi::TopicTranslator.new(text_for_translation(translatable), language).translate
4745
end
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseTranslator
  # Splits long content into chunks of roughly CHUNK_SIZE characters so that
  # each chunk can be sent for translation separately.
  #
  # Split points are chosen, in order of preference, at the end of a top-level
  # HTML node, at the end of a BBCode/code block, at a natural text boundary
  # (paragraph, sentence, clause, newline, whitespace), and finally at a hard
  # cut. A chunk may grow up to 1.5x the requested size to keep a block intact.
  #
  # Joining the returned chunks with "" reproduces the input.
  class ContentSplitter
    CHUNK_SIZE = 3000

    BBCODE_PATTERNS = [
      %r{\[table.*?\].*?\[/table\]}m,
      %r{\[quote.*?\].*?\[/quote\]}m,
      %r{\[details.*?\].*?\[/details\]}m,
      %r{\<details.*?\>.*?\</details\>}m,
      %r{\[spoiler.*?\].*?\[/spoiler\]}m,
      %r{\[code.*?\].*?\[/code\]}m,
      /```.*?```/m,
    ].freeze

    # Ordered from the most to the least desirable place to cut plain text.
    TEXT_BOUNDARIES = [
      /\n\s*\n\s*|\r\n\s*\r\n\s*/, # double newlines with optional spaces
      /[.!?]\s+/, # sentence endings
      /[,;]\s+/, # clause endings
      /\n|\r\n/, # single newlines
      /\s+/, # any whitespace
    ].freeze

    # Splits +content+ into an ordered list of chunks.
    #
    # @param content [String, nil] the text to split
    # @return [Array<String>] [] for nil, [""] for an empty string, the
    #   content itself when it already fits, otherwise the list of chunks
    def self.split(content)
      return [] if content.nil?
      return [""] if content.empty?
      return [content] if content.length <= CHUNK_SIZE

      chunks = []
      remaining = content

      until remaining.empty?
        # A whitespace-only tail is glued onto the previous chunk: it must not
        # be dropped (the chunks have to rejoin to the input), but it is not
        # worth a chunk of its own either.
        if chunks.any? && remaining.match?(/\A[[:space:]]*\z/)
          chunks[-1] = chunks.last + remaining
          break
        end

        chunk = extract_mixed_chunk(remaining)
        if chunk.empty?
          # Defensive: never loop forever and never lose content.
          chunks << remaining
          break
        end

        chunks << chunk
        remaining = remaining[chunk.length..]
      end

      chunks
    end

    # Returns the leading part of +text+ that forms the next chunk.
    #
    # Each strategy proposes a split index; the first proposal that is
    # positive and no larger than 1.5x +size+ wins. The hard cut at +size+ is
    # the last resort.
    def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
      return text if text.length <= size

      flexible_size = size * 1.5

      # Evaluated lazily so later (more expensive or less desirable)
      # strategies only run when the earlier ones yield nothing usable.
      strategies = [
        -> { find_nearest_html_end_index(text, size) },
        -> { find_nearest_bbcode_end_index(text, size) },
        -> { find_text_boundary(text, size) },
        -> { size },
      ]

      split_point =
        strategies.lazy.filter_map(&:call).find { |pos| pos.positive? && pos <= flexible_size }

      split_point ? text[0...split_point] : text
    end

    # Returns the end offset of the first top-level HTML node that extends
    # past +target_pos+, or nil when the text has no "<" or no node does.
    #
    # NOTE(review): offsets are accumulated from Nokogiri's re-serialized
    # HTML (+to_html+), whose length can differ from the source text when the
    # parser normalizes markup - confirm this is acceptable for real posts.
    def self.find_nearest_html_end_index(text, target_pos)
      return nil unless text.include?("<")

      consumed = 0
      Nokogiri::HTML5.fragment(text).children.each do |node|
        consumed += node.to_html.length
        return consumed if consumed > target_pos
      end

      nil
    rescue Nokogiri::SyntaxError
      nil
    end

    # Returns the end offset of a BBCode/code block that straddles
    # +target_pos+ (starts at or before it, ends after it), or nil.
    def self.find_nearest_bbcode_end_index(text, target_pos)
      BBCODE_PATTERNS.each do |pattern|
        text.scan(pattern) do
          match = Regexp.last_match
          return match.end(0) if match.begin(0) <= target_pos && match.end(0) > target_pos
        end
      end

      nil
    end

    # Returns the offset just after the best text boundary that starts at or
    # before +target_pos+, or nil when the text has no boundary there.
    #
    # The offset is the END of the boundary match, so sentence and clause
    # punctuation stays attached to the text it terminates and the result is
    # always greater than zero.
    def self.find_text_boundary(text, target_pos)
      TEXT_BOUNDARIES.each do |pattern|
        next unless text.rindex(pattern, target_pos)

        pos = Regexp.last_match.end(0)
        # Include all trailing whitespace
        pos += 1 while pos < text.length && text[pos].match?(/\s/)
        return pos
      end

      nil
    end

    # A bare `private` does not apply to `def self.` methods, so the helpers
    # are hidden explicitly.
    private_class_method :extract_mixed_chunk,
                         :find_nearest_html_end_index,
                         :find_nearest_bbcode_end_index,
                         :find_text_boundary
  end
end

spec/lib/content_splitter_spec.rb

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# frozen_string_literal: true
2+
3+
require "rails_helper"
4+
5+
describe DiscourseTranslator::ContentSplitter do
  # Overrides CHUNK_SIZE for the current example only. stub_const restores the
  # real constant after each example, so no manual reset (and no hard-coded
  # "original" value that can drift from the class) is needed.
  def set_limit(value)
    stub_const("DiscourseTranslator::ContentSplitter::CHUNK_SIZE", value)
  end

  it "returns a single empty chunk for empty input" do
    expect(described_class.split("")).to eq([""])
  end

  it "handles content with only spaces" do
    expect(described_class.split(" ")).to eq([" "])
    expect(described_class.split(" ")).to eq([" "])
  end

  it "handles nil input" do
    expect(described_class.split(nil)).to eq([])
  end

  it "doesn't split content under limit" do
    text = "hello world"
    expect(described_class.split(text)).to eq([text])
  end

  it "preserves HTML tags" do
    set_limit(10)
    text = "<p>hello</p><p>meow</p>"
    expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>])

    set_limit(35)
    text = "<div>hello</div> <div>jurassic</div> <p>world</p>"
    expect(described_class.split(text)).to eq(
      ["<div>hello</div> <div>jurassic</div>", " <p>world</p>"],
    )
  end

  it "preserves BBCode tags" do
    set_limit(20)
    text = "[quote]hello[/quote][details]world[/details]"
    expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
  end

  it "doesn't split in middle of words" do
    set_limit(10)
    text = "my kitty best in the world"
    expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
  end

  it "handles nested tags properly" do
    set_limit(25)
    text = "<div>hello<p>cat</p>world</div><p>meow</p>"
    expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>])
  end

  it "handles mixed HTML and BBCode" do
    set_limit(15)
    text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
    expect(described_class.split(text)).to eq(
      ["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"],
    )
  end

  it "preserves newlines in sensible places" do
    set_limit(10)
    text = "hello\nbeautiful\nworld\n"
    expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
  end

  it "handles email content properly" do
    set_limit(20)
    # NOTE(review): the address literals below look redacted in this copy of
    # the file - confirm them against the real fixture before relying on this
    # example.
    text = "From: [email protected]\nTo: [email protected]\nSubject: Hello\n\nContent here"
    expect(described_class.split(text)).to eq(
      ["From: [email protected]\n", "To: [email protected]\n", "Subject: Hello\n\n", "Content here"],
    )
  end

  it "keeps code blocks intact" do
    set_limit(30)
    text = "Text\n```\ncode block\nhere\n```\nmore text"
    expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
  end

  context "with multiple details tags" do
    it "splits correctly between details tags" do
      set_limit(30)
      text = "<details>first content</details><details>second content</details>"
      expect(described_class.split(text)).to eq(
        ["<details>first content</details>", "<details>second content</details>"],
      )
    end
  end
end

spec/services/discourse_ai_spec.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,13 @@
6060
expect(translated_text).to eq "some translated text"
6161
end
6262
end
63+
64+
it "sends the content for splitting and the split content for translation" do
65+
post.update(raw: "#{"a" * 3000} #{"b" * 3000}")
66+
DiscourseAi::Completions::Llm.with_prepared_responses(
67+
%w[lol wut].map { |content| translation_json(content) },
68+
) { expect(DiscourseTranslator::DiscourseAi.translate!(post)).to eq "<p>lolwut</p>" }
69+
end
6370
end
6471

6572
def locale_json(content)

0 commit comments

Comments
 (0)