
Commit 6f60315

move splitter to translator to use the LlmModel's max_output_tokens
1 parent: e2d7ca0

9 files changed, +84 -86 lines changed


app/jobs/regular/detect_translate_post.rb (1 addition & 1 deletion)

@@ -49,7 +49,7 @@ def execute(args)
       # do nothing, there are too many sporadic lookup failures
     rescue => e
       DiscourseAi::Translation::VerboseLogger.log(
-        "Failed to translate post #{post.id} to #{locale}: #{e.message}",
+        "Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
       )
     end
   end
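The same one-line change is repeated in the four job files that follow: the generic rescue now appends the top of the backtrace to the verbose log entry so sporadic failures are easier to trace. A minimal sketch of how that interpolation renders, where the raised error, post id, and locale are made up for illustration:

# Illustrative only: the error, post id, and locale are hypothetical.
begin
  raise StandardError, "LLM request timed out"
rescue => e
  message =
    "Failed to translate post 123 to ja: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}"
  puts message # the error message, then the first four backtrace frames
end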

app/jobs/regular/detect_translate_topic.rb (1 addition & 1 deletion)

@@ -47,7 +47,7 @@ def execute(args)
       # do nothing, there are too many sporadic lookup failures
     rescue => e
       DiscourseAi::Translation::VerboseLogger.log(
-        "Failed to translate topic #{topic.id} to #{locale}: #{e.message}",
+        "Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
       )
     end
   end

app/jobs/regular/localize_categories.rb (1 addition & 1 deletion)

@@ -40,7 +40,7 @@ def execute(args)
       # do nothing, there are too many sporadic lookup failures
     rescue => e
       DiscourseAi::Translation::VerboseLogger.log(
-        "Failed to translate category #{category.id} to #{locale}: #{e.message}",
+        "Failed to translate category #{category.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
       )
     ensure
       remaining_limit -= 1

app/jobs/regular/localize_posts.rb (1 addition & 1 deletion)

@@ -65,7 +65,7 @@ def execute(args)
       # do nothing, there are too many sporadic lookup failures
     rescue => e
       DiscourseAi::Translation::VerboseLogger.log(
-        "Failed to translate post #{post.id} to #{locale}: #{e.message}",
+        "Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
       )
     end
   end

app/jobs/regular/localize_topics.rb (1 addition & 1 deletion)

@@ -62,7 +62,7 @@ def execute(args)
       # do nothing, there are too many sporadic lookup failures
     rescue => e
      DiscourseAi::Translation::VerboseLogger.log(
-        "Failed to translate topic #{topic.id} to #{locale}: #{e.message}",
+        "Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
       )
     end
   end

lib/translation/base_translator.rb (19 additions & 17 deletions)

@@ -15,26 +15,34 @@ def translate
         if (ai_persona = AiPersona.find_by(id: persona_setting)).blank?
           return nil
         end
-
+        translation_user = ai_persona.user || Discourse.system_user
         persona_klass = ai_persona.class_instance
         persona = persona_klass.new

-        llm_model = LlmModel.find_by(id: preferred_llm_model(persona_klass))
-        return nil if llm_model.blank?
+        model = LlmModel.find_by(id: preferred_llm_model(persona_klass))
+        return nil if model.blank?

-        bot =
-          DiscourseAi::Personas::Bot.as(
-            ai_persona.user || Discourse.system_user,
-            persona: persona,
-            model: llm_model,
-          )
+        bot = DiscourseAi::Personas::Bot.as(translation_user, persona:, model:)
+
+        ContentSplitter
+          .split(content: @text, chunk_size: model.max_output_tokens)
+          .map { |text| get_translation(text:, bot:, translation_user:) }
+          .join("")
+      end
+
+      private

+      def formatted_content(content)
+        { content:, target_locale: @target_locale }.to_json
+      end
+
+      def get_translation(text:, bot:, translation_user:)
        context =
          DiscourseAi::Personas::BotContext.new(
-            user: ai_persona.user || Discourse.system_user,
+            user: translation_user,
            skip_tool_details: true,
            feature_name: "translation",
-            messages: [{ type: :user, content: formatted_content }],
+            messages: [{ type: :user, content: formatted_content(text) }],
            topic: @topic,
            post: @post,
          )
@@ -47,12 +55,6 @@ def translate
        structured_output&.read_buffered_property(:translation)
      end

-      def formatted_content
-        { content: @text, target_locale: @target_locale }.to_json
-      end
-
-      private
-
      def persona_setting
        raise NotImplementedError
      end
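The net effect is that BaseTranslator#translate now owns chunking: the input is split against the model's max_output_tokens, each chunk is translated through the persona bot, and the results are concatenated. A rough sketch of that flow, assuming the plugin is loaded (e.g. in a Rails console); the fake_* names are placeholders for model.max_output_tokens and the bot round-trip, and only ContentSplitter.split is the real API:

# Hedged sketch of the new flow; fake_* values are stand-ins, not real API.
raw_text = "<p>hello</p>" * 2_000
fake_max_output_tokens = 8192          # stands in for model.max_output_tokens
fake_translate = ->(chunk) { chunk }   # stands in for the persona bot + structured output

translated =
  DiscourseAi::Translation::ContentSplitter
    .split(content: raw_text, chunk_size: fake_max_output_tokens)
    .map { |chunk| fake_translate.call(chunk) }
    .join("")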

lib/translation/content_splitter.rb (17 additions & 14 deletions)

@@ -3,7 +3,7 @@
 module DiscourseAi
   module Translation
     class ContentSplitter
-      CHUNK_SIZE = 3000
+      DEFAULT_CHUNK_SIZE = 8192

       BBCODE_PATTERNS = [
         %r{\[table.*?\].*?\[/table\]}m,
@@ -23,16 +23,16 @@ class ContentSplitter
         /\s+/, # any whitespace
       ].freeze

-      def self.split(content)
+      def self.split(content:, chunk_size: DEFAULT_CHUNK_SIZE)
         return [] if content.nil?
         return [""] if content.empty?
-        return [content] if content.length <= CHUNK_SIZE
+        return [content] if content.length <= chunk_size

         chunks = []
         remaining = content.dup

         while remaining.present?
-          chunk = extract_mixed_chunk(remaining)
+          chunk = extract_mixed_chunk(remaining, size: chunk_size)
           break if chunk.empty?
           chunks << chunk
           remaining = remaining[chunk.length..-1]
@@ -43,9 +43,8 @@ def self.split(content)

       private

-      def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
+      def self.extract_mixed_chunk(text, size:)
         return text if text.length <= size
-        flexible_size = size * 1.5

         # try each splitting strategy in order
         split_point =
@@ -54,7 +53,7 @@ def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
             -> { find_nearest_bbcode_end_index(text, size) },
             -> { find_text_boundary(text, size) },
             -> { size },
-          ].lazy.map(&:call).compact.find { |pos| pos <= flexible_size }
+          ].lazy.map(&:call).compact.find { |pos| pos <= size }

         text[0...split_point]
       end
@@ -64,13 +63,15 @@ def self.find_nearest_html_end_index(text, target_pos)

        begin
          doc = Nokogiri::HTML5.fragment(text)
-          current_length = 0
+          max_length_within_target = 0

          doc.children.each do |node|
            html = node.to_html
-            end_pos = current_length + html.length
-            return end_pos if end_pos > target_pos
-            current_length = end_pos
+            end_pos = max_length_within_target + html.length
+            if (max_length_within_target > 0 && end_pos > target_pos)
+              return max_length_within_target
+            end
+            max_length_within_target = end_pos
          end
          nil
        rescue Nokogiri::SyntaxError
@@ -79,13 +80,15 @@ def self.find_nearest_html_end_index(text, target_pos)
       end

       def self.find_nearest_bbcode_end_index(text, target_pos)
+        max_length_within_target = 0
        BBCODE_PATTERNS.each do |pattern|
          text.scan(pattern) do |_|
            match = $~
-            tag_start = match.begin(0)
            tag_end = match.end(0)
-
-            return tag_end if tag_start <= target_pos && tag_end > target_pos
+            if (max_length_within_target > 0 && tag_end > target_pos)
+              return max_length_within_target
+            end
+            max_length_within_target = tag_end
          end
        end
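After this change the splitter has no class-wide mutable limit: callers pass chunk_size per call (falling back to DEFAULT_CHUNK_SIZE = 8192), and a chunk can never exceed that size now that the old flexible_size = size * 1.5 overshoot is gone. A usage sketch against the new keyword API, assuming the plugin is loaded; the expected values follow the updated spec:

# Assumes the plugin is loaded; expected results follow the updated spec below.
splitter = DiscourseAi::Translation::ContentSplitter

splitter.split(content: "<p>hello</p><p>meow</p>", chunk_size: 15)
# => ["<p>hello</p>", "<p>meow</p>"]

splitter.split(content: "a" * 100, chunk_size: 10)
# => ten chunks of "aaaaaaaaaa"; unsplittable runs are cut at the hard limit

splitter.split(content: "short text")
# => ["short text"]; with no chunk_size given, DEFAULT_CHUNK_SIZE (8192) applies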

lib/translation/post_localizer.rb (1 addition & 5 deletions)

@@ -11,11 +11,7 @@ def self.localize(post, target_locale = I18n.locale)
       return if post.raw.length > SiteSetting.ai_translation_max_post_length
       target_locale = target_locale.to_s.sub("-", "_")

-      translated_raw =
-        ContentSplitter
-          .split(post.raw)
-          .map { |text| PostRawTranslator.new(text:, target_locale:, post:).translate }
-          .join("")
+      translated_raw = PostRawTranslator.new(text: post.raw, target_locale:, post:).translate

       localization =
         PostLocalization.find_or_initialize_by(post_id: post.id, locale: target_locale)
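PostLocalizer no longer chunks post.raw itself; it hands the full raw string to PostRawTranslator, and the splitting now happens inside BaseTranslator#translate shown above. Roughly, as a sketch where `post` is assumed to be a Post record and "ja" is just an example locale:

# Sketch only: `post` is assumed to be a Post record and "ja" is an example locale.
translated_raw =
  DiscourseAi::Translation::PostRawTranslator
    .new(text: post.raw, target_locale: "ja", post: post)
    .translate
# chunking against the model's max_output_tokens now happens inside #translate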

spec/lib/translation/content_splitter_spec.rb (42 additions & 45 deletions)

@@ -1,95 +1,92 @@
 # frozen_string_literal: true

 describe DiscourseAi::Translation::ContentSplitter do
-  let(:original_limit) { 4000 }
-
-  after { described_class.const_set(:CHUNK_SIZE, original_limit) }
-
-  def set_limit(value)
-    described_class.const_set(:CHUNK_SIZE, value)
-  end
-
   it "returns empty array for empty input" do
-    expect(described_class.split("")).to eq([""])
+    expect(described_class.split(content: "")).to eq([""])
   end

   it "handles content with only spaces" do
-    expect(described_class.split(" ")).to eq([" "])
-    expect(described_class.split("  ")).to eq(["  "])
+    expect(described_class.split(content: " ")).to eq([" "])
+    expect(described_class.split(content: "  ")).to eq(["  "])
   end

   it "handles nil input" do
-    expect(described_class.split(nil)).to eq([])
+    expect(described_class.split(content: nil)).to eq([])
   end

   it "doesn't split content under limit" do
-    text = "hello world"
-    expect(described_class.split(text)).to eq([text])
+    content = "hello world"
+    expect(described_class.split(content:, chunk_size: 20)).to eq([content])
+  end
+
+  it "splits to max chunk size if unsplittable" do
+    content = "a" * 100
+    expect(described_class.split(content:, chunk_size: 10)).to eq(["a" * 10] * 10)
   end

   it "preserves HTML tags" do
-    set_limit(10)
-    text = "<p>hello</p><p>meow</p>"
-    expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>])
+    content = "<p>hello</p><p>meow</p>"
+    expect(described_class.split(content:, chunk_size: 15)).to eq(%w[<p>hello</p> <p>meow</p>])

-    set_limit(35)
-    text = "<div>hello</div> <div>jurassic</div> <p>world</p>"
-    expect(described_class.split(text)).to eq(
-      ["<div>hello</div> <div>jurassic</div>", " <p>world</p>"],
+    content = "<div>hello</div> <div>jurassic</div> <p>world</p>"
+    expect(described_class.split(content:, chunk_size: 40)).to eq(
+      ["<div>hello</div> <div>jurassic</div> ", "<p>world</p>"],
     )
   end

   it "preserves BBCode tags" do
-    set_limit(20)
-    text = "[quote]hello[/quote][details]world[/details]"
-    expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
+    content = "[quote]hello[/quote][details]world[/details]"
+    expect(described_class.split(content:, chunk_size: 25)).to eq(
+      ["[quote]hello[/quote]", "[details]world[/details]"],
+    )
   end

   it "doesn't split in middle of words" do
-    set_limit(10)
-    text = "my kitty best in the world"
-    expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
+    content = "my kitty best in the world"
+    expect(described_class.split(content:, chunk_size: 10)).to eq(
+      ["my kitty ", "best in ", "the world"],
+    )
   end

   it "handles nested tags properly" do
-    set_limit(25)
-    text = "<div>hello<p>cat</p>world</div><p>meow</p>"
-    expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>])
+    content = "<div>hello<p>cat</p>world</div><p>meow</p>"
+    expect(described_class.split(content:, chunk_size: 35)).to eq(
+      %w[<div>hello<p>cat</p>world</div> <p>meow</p>],
+    )
   end

   it "handles mixed HTML and BBCode" do
-    set_limit(15)
-    text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
-    expect(described_class.split(text)).to eq(
+    content = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
+    expect(described_class.split(content:, chunk_size: 20)).to eq(
       ["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"],
     )
   end

   it "preserves newlines in sensible places" do
-    set_limit(10)
-    text = "hello\nbeautiful\nworld\n"
-    expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
+    content = "hello\nbeautiful\nworld\n"
+    expect(described_class.split(content:, chunk_size: 10)).to eq(
+      ["hello\n", "beautiful\n", "world\n"],
+    )
   end

   it "handles email content properly" do
-    set_limit(20)
-    text = "From: [email protected]\nTo: [email protected]\nSubject: Hello\n\nContent here"
-    expect(described_class.split(text)).to eq(
+    content = "From: [email protected]\nTo: [email protected]\nSubject: Hello\n\nContent here"
+    expect(described_class.split(content:, chunk_size: 20)).to eq(
       ["From: [email protected]\n", "To: [email protected]\n", "Subject: Hello\n\n", "Content here"],
     )
   end

   it "keeps code blocks intact" do
-    set_limit(30)
-    text = "Text\n```\ncode block\nhere\n```\nmore text"
-    expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
+    content = "Text\n```\ncode block\nhere\n```\nmore text"
+    expect(described_class.split(content:, chunk_size: 30)).to eq(
+      ["Text\n```\ncode block\nhere\n```\n", "more text"],
    )
   end

   context "with multiple details tags" do
     it "splits correctly between details tags" do
-      set_limit(30)
-      text = "<details>first content</details><details>second content</details>"
-      expect(described_class.split(text)).to eq(
+      content = "<details>first content</details><details>second content</details>"
+      expect(described_class.split(content:, chunk_size: 35)).to eq(
        ["<details>first content</details>", "<details>second content</details>"],
      )
    end
