Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/jobs/regular/detect_translate_post.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def execute(args)
# do nothing, there are too many sporadic lookup failures
rescue => e
DiscourseAi::Translation::VerboseLogger.log(
"Failed to translate post #{post.id} to #{locale}: #{e.message}",
"Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
)
end
end
Expand Down
2 changes: 1 addition & 1 deletion app/jobs/regular/detect_translate_topic.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def execute(args)
# do nothing, there are too many sporadic lookup failures
rescue => e
DiscourseAi::Translation::VerboseLogger.log(
"Failed to translate topic #{topic.id} to #{locale}: #{e.message}",
"Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
)
end
end
Expand Down
2 changes: 1 addition & 1 deletion app/jobs/regular/localize_categories.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def execute(args)
# do nothing, there are too many sporadic lookup failures
rescue => e
DiscourseAi::Translation::VerboseLogger.log(
"Failed to translate category #{category.id} to #{locale}: #{e.message}",
"Failed to translate category #{category.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
)
ensure
remaining_limit -= 1
Expand Down
2 changes: 1 addition & 1 deletion app/jobs/regular/localize_posts.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def execute(args)
# do nothing, there are too many sporadic lookup failures
rescue => e
DiscourseAi::Translation::VerboseLogger.log(
"Failed to translate post #{post.id} to #{locale}: #{e.message}",
"Failed to translate post #{post.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
)
end
end
Expand Down
2 changes: 1 addition & 1 deletion app/jobs/regular/localize_topics.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def execute(args)
# do nothing, there are too many sporadic lookup failures
rescue => e
DiscourseAi::Translation::VerboseLogger.log(
"Failed to translate topic #{topic.id} to #{locale}: #{e.message}",
"Failed to translate topic #{topic.id} to #{locale}: #{e.message}\n\n#{e.backtrace[0..3].join("\n")}",
)
end
end
Expand Down
36 changes: 19 additions & 17 deletions lib/translation/base_translator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,34 @@ def translate
if (ai_persona = AiPersona.find_by(id: persona_setting)).blank?
return nil
end

translation_user = ai_persona.user || Discourse.system_user
persona_klass = ai_persona.class_instance
persona = persona_klass.new

llm_model = LlmModel.find_by(id: preferred_llm_model(persona_klass))
return nil if llm_model.blank?
model = LlmModel.find_by(id: preferred_llm_model(persona_klass))
return nil if model.blank?

bot =
DiscourseAi::Personas::Bot.as(
ai_persona.user || Discourse.system_user,
persona: persona,
model: llm_model,
)
bot = DiscourseAi::Personas::Bot.as(translation_user, persona:, model:)

ContentSplitter
.split(content: @text, chunk_size: model.max_output_tokens)
.map { |text| get_translation(text:, bot:, translation_user:) }
.join("")
end

private

# Builds the JSON string used as the user message content for one
# translation request: the text to translate plus the target locale
# held in @target_locale.
#
# @param content [String] the piece of text to be translated
# @return [String] JSON object with "content" and "target_locale" keys
def formatted_content(content)
  payload = { content: content, target_locale: @target_locale }
  payload.to_json
end

def get_translation(text:, bot:, translation_user:)
context =
DiscourseAi::Personas::BotContext.new(
user: ai_persona.user || Discourse.system_user,
user: translation_user,
skip_tool_details: true,
feature_name: "translation",
messages: [{ type: :user, content: formatted_content }],
messages: [{ type: :user, content: formatted_content(text) }],
topic: @topic,
post: @post,
)
Expand All @@ -47,12 +55,6 @@ def translate
structured_output&.read_buffered_property(:translation)
end

def formatted_content
{ content: @text, target_locale: @target_locale }.to_json
end

private

# Abstract hook: concrete translators must override this method.
# `translate` passes the returned value to `AiPersona.find_by(id: ...)`
# to look up the persona used for the translation.
#
# @raise [NotImplementedError] always, unless overridden
def persona_setting
raise NotImplementedError
end
Expand Down
32 changes: 18 additions & 14 deletions lib/translation/content_splitter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
module DiscourseAi
module Translation
class ContentSplitter
CHUNK_SIZE = 3000
DEFAULT_CHUNK_SIZE = 8192

BBCODE_PATTERNS = [
%r{\[table.*?\].*?\[/table\]}m,
Expand All @@ -23,16 +23,17 @@ class ContentSplitter
/\s+/, # any whitespace
].freeze

def self.split(content)
def self.split(content:, chunk_size: DEFAULT_CHUNK_SIZE)
return [] if content.nil?
return [""] if content.empty?
return [content] if content.length <= CHUNK_SIZE
chunk_size ||= DEFAULT_CHUNK_SIZE
return [content] if content.length <= chunk_size

chunks = []
remaining = content.dup

while remaining.present?
chunk = extract_mixed_chunk(remaining)
chunk = extract_mixed_chunk(remaining, size: chunk_size)
break if chunk.empty?
chunks << chunk
remaining = remaining[chunk.length..-1]
Expand All @@ -43,9 +44,8 @@ def self.split(content)

private

def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
def self.extract_mixed_chunk(text, size:)
return text if text.length <= size
flexible_size = size * 1.5

# try each splitting strategy in order
split_point =
Expand All @@ -54,7 +54,7 @@ def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
-> { find_nearest_bbcode_end_index(text, size) },
-> { find_text_boundary(text, size) },
-> { size },
].lazy.map(&:call).compact.find { |pos| pos <= flexible_size }
].lazy.map(&:call).compact.find { |pos| pos <= size }

text[0...split_point]
end
Expand All @@ -64,13 +64,15 @@ def self.find_nearest_html_end_index(text, target_pos)

begin
doc = Nokogiri::HTML5.fragment(text)
current_length = 0
max_length_within_target = 0

doc.children.each do |node|
html = node.to_html
end_pos = current_length + html.length
return end_pos if end_pos > target_pos
current_length = end_pos
end_pos = max_length_within_target + html.length
if (max_length_within_target > 0 && end_pos > target_pos)
return max_length_within_target
end
max_length_within_target = end_pos
end
nil
rescue Nokogiri::SyntaxError
Expand All @@ -79,13 +81,15 @@ def self.find_nearest_html_end_index(text, target_pos)
end

def self.find_nearest_bbcode_end_index(text, target_pos)
max_length_within_target = 0
BBCODE_PATTERNS.each do |pattern|
text.scan(pattern) do |_|
match = $~
tag_start = match.begin(0)
tag_end = match.end(0)

return tag_end if tag_start <= target_pos && tag_end > target_pos
if (max_length_within_target > 0 && tag_end > target_pos)
return max_length_within_target
end
max_length_within_target = tag_end
end
end

Expand Down
6 changes: 1 addition & 5 deletions lib/translation/post_localizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,7 @@ def self.localize(post, target_locale = I18n.locale)
return if post.raw.length > SiteSetting.ai_translation_max_post_length
target_locale = target_locale.to_s.sub("-", "_")

translated_raw =
ContentSplitter
.split(post.raw)
.map { |text| PostRawTranslator.new(text:, target_locale:, post:).translate }
.join("")
translated_raw = PostRawTranslator.new(text: post.raw, target_locale:, post:).translate

localization =
PostLocalization.find_or_initialize_by(post_id: post.id, locale: target_locale)
Expand Down
4 changes: 1 addition & 3 deletions spec/lib/translation/base_translator_spec.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# frozen_string_literal: true

require "rails_helper"

describe DiscourseAi::Translation::BaseTranslator do
let!(:persona) do
AiPersona.find(
Expand All @@ -28,7 +26,7 @@
DiscourseAi::Translation::PostRawTranslator.new(text:, target_locale:, post:)
allow(DiscourseAi::Completions::Prompt).to receive(:new).with(
persona.system_prompt,
messages: array_including({ type: :user, content: post_translator.formatted_content }),
messages: array_including({ type: :user, content: a_string_including(text) }),
post_id: post.id,
topic_id: post.topic_id,
).and_call_original
Expand Down
87 changes: 42 additions & 45 deletions spec/lib/translation/content_splitter_spec.rb
Original file line number Diff line number Diff line change
@@ -1,95 +1,92 @@
# frozen_string_literal: true

describe DiscourseAi::Translation::ContentSplitter do
let(:original_limit) { 4000 }

after { described_class.const_set(:CHUNK_SIZE, original_limit) }

def set_limit(value)
described_class.const_set(:CHUNK_SIZE, value)
end

it "returns empty array for empty input" do
expect(described_class.split("")).to eq([""])
expect(described_class.split(content: "")).to eq([""])
end

it "handles content with only spaces" do
expect(described_class.split(" ")).to eq([" "])
expect(described_class.split(" ")).to eq([" "])
expect(described_class.split(content: " ")).to eq([" "])
expect(described_class.split(content: " ")).to eq([" "])
end

it "handles nil input" do
expect(described_class.split(nil)).to eq([])
expect(described_class.split(content: nil)).to eq([])
end

it "doesn't split content under limit" do
text = "hello world"
expect(described_class.split(text)).to eq([text])
content = "hello world"
expect(described_class.split(content:, chunk_size: 20)).to eq([content])
end

it "splits to max chunk size if unsplittable" do
content = "a" * 100
expect(described_class.split(content:, chunk_size: 10)).to eq(["a" * 10] * 10)
end

it "preserves HTML tags" do
set_limit(10)
text = "<p>hello</p><p>meow</p>"
expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>])
content = "<p>hello</p><p>meow</p>"
expect(described_class.split(content:, chunk_size: 15)).to eq(%w[<p>hello</p> <p>meow</p>])

set_limit(35)
text = "<div>hello</div> <div>jurassic</div> <p>world</p>"
expect(described_class.split(text)).to eq(
["<div>hello</div> <div>jurassic</div>", " <p>world</p>"],
content = "<div>hello</div> <div>jurassic</div> <p>world</p>"
expect(described_class.split(content:, chunk_size: 40)).to eq(
["<div>hello</div> <div>jurassic</div> ", "<p>world</p>"],
)
end

it "preserves BBCode tags" do
set_limit(20)
text = "[quote]hello[/quote][details]world[/details]"
expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
content = "[quote]hello[/quote][details]world[/details]"
expect(described_class.split(content:, chunk_size: 25)).to eq(
["[quote]hello[/quote]", "[details]world[/details]"],
)
end

it "doesn't split in middle of words" do
set_limit(10)
text = "my kitty best in the world"
expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
content = "my kitty best in the world"
expect(described_class.split(content:, chunk_size: 10)).to eq(
["my kitty ", "best in ", "the world"],
)
end

it "handles nested tags properly" do
set_limit(25)
text = "<div>hello<p>cat</p>world</div><p>meow</p>"
expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>])
content = "<div>hello<p>cat</p>world</div><p>meow</p>"
expect(described_class.split(content:, chunk_size: 35)).to eq(
%w[<div>hello<p>cat</p>world</div> <p>meow</p>],
)
end

it "handles mixed HTML and BBCode" do
set_limit(15)
text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
expect(described_class.split(text)).to eq(
content = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
expect(described_class.split(content:, chunk_size: 20)).to eq(
["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"],
)
end

it "preserves newlines in sensible places" do
set_limit(10)
text = "hello\nbeautiful\nworld\n"
expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
content = "hello\nbeautiful\nworld\n"
expect(described_class.split(content:, chunk_size: 10)).to eq(
["hello\n", "beautiful\n", "world\n"],
)
end

it "handles email content properly" do
set_limit(20)
text = "From: [email protected]\nTo: [email protected]\nSubject: Hello\n\nContent here"
expect(described_class.split(text)).to eq(
content = "From: [email protected]\nTo: [email protected]\nSubject: Hello\n\nContent here"
expect(described_class.split(content:, chunk_size: 20)).to eq(
["From: [email protected]\n", "To: [email protected]\n", "Subject: Hello\n\n", "Content here"],
)
end

it "keeps code blocks intact" do
set_limit(30)
text = "Text\n```\ncode block\nhere\n```\nmore text"
expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
content = "Text\n```\ncode block\nhere\n```\nmore text"
expect(described_class.split(content:, chunk_size: 30)).to eq(
["Text\n```\ncode block\nhere\n```\n", "more text"],
)
end

context "with multiple details tags" do
it "splits correctly between details tags" do
set_limit(30)
text = "<details>first content</details><details>second content</details>"
expect(described_class.split(text)).to eq(
content = "<details>first content</details><details>second content</details>"
expect(described_class.split(content:, chunk_size: 35)).to eq(
["<details>first content</details>", "<details>second content</details>"],
)
end
Expand Down
Loading