Skip to content

Commit 7efffc4

Browse files
committed
FIX: Split raw content to prevent job from timing out
1 parent bd57bb7 commit 7efffc4

File tree

6 files changed

+233
-23
lines changed

6 files changed

+233
-23
lines changed

app/services/discourse_ai/translator.rb

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@ module DiscourseAi
44
class Translator
55
PROMPT_TEMPLATE = <<~TEXT.freeze
66
You are an expert translator specializing in converting Markdown content from any source language to target locale "%{target_language}". Your task is to:
7-
1. Translate the content accurately while preserving all Markdown formatting elements
8-
2. Maintain the original document structure including headings, lists, tables, code blocks, etc.
9-
3. Preserve all links, images, and other media references without translation
10-
4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
11-
5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
12-
6. For ambiguous terms or phrases, choose the most contextually appropriate translation
13-
7. You are being consumed via an API, only EVER return the translated text, do not return any other information
7+
1. Accurately translate text only
8+
2. Preserve all Markdown formatting elements, including incomplete ones
9+
3. Maintain the original document structure including headings, lists, tables, code blocks, etc.
10+
4. Preserve all links, images, and other media references without translation
11+
5. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
12+
6. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
13+
7. For ambiguous terms or phrases, choose the most contextually appropriate translation
14+
8. You are being consumed via an API, only EVER return the translated text, do not return any other information
1415
TEXT
1516

1617
def initialize(text, target_language)

app/services/discourse_translator/discourse_ai.rb

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
# frozen_string_literal: true
22

3-
require_relative "base"
4-
require "json"
5-
63
module DiscourseTranslator
74
class DiscourseAi < Base
85
MAX_DETECT_LOCALE_TEXT_LENGTH = 1000
@@ -33,7 +30,12 @@ def self.translate!(translatable, target_locale_sym = I18n.locale)
3330
),
3431
)
3532
end
36-
::DiscourseAi::Translator.new(text_for_translation(translatable), target_locale_sym).translate
33+
34+
text = text_for_translation(translatable)
35+
chunks = DiscourseTranslator::ContentSplitter.split(text)
36+
chunks
37+
.map { |chunk| ::DiscourseAi::Translator.new(chunk, target_locale_sym).translate }
38+
.join("")
3739
end
3840

3941
private
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseTranslator
  # Splits long raw content into smaller chunks so that each chunk can be
  # handled separately. Split points are chosen, in order of preference, at
  # the end of a top-level HTML node, at the end of a BBCode / fenced code
  # block, or at a natural text boundary; a hard cut is the last resort.
  #
  # Joining the returned chunks with "" reproduces the original content.
  class ContentSplitter
    # Default target chunk length, in characters.
    CHUNK_SIZE = 3000

    # Block-level constructs that should not be cut in the middle.
    BBCODE_PATTERNS = [
      %r{\[table.*?\].*?\[/table\]}m,
      %r{\[quote.*?\].*?\[/quote\]}m,
      %r{\[details.*?\].*?\[/details\]}m,
      %r{\<details.*?\>.*?\</details\>}m,
      %r{\[spoiler.*?\].*?\[/spoiler\]}m,
      %r{\[code.*?\].*?\[/code\]}m,
      /```.*?```/m,
    ].freeze

    # Text boundaries, from most to least preferred.
    TEXT_BOUNDARIES = [
      /\n\s*\n\s*|\r\n\s*\r\n\s*/, # double newlines with optional spaces
      /[.!?]\s+/, # sentence endings
      /[,;]\s+/, # clause endings
      /\n|\r\n/, # single newlines
      /\s+/, # any whitespace
    ].freeze

    # Matches strings made up of whitespace only (or nothing at all).
    WHITESPACE_ONLY = /\A[[:space:]]*\z/

    # Splits +content+ into an ordered list of chunks.
    #
    # @param content [String, nil] the text to split
    # @param chunk_size [Integer] target chunk length in characters; a chunk
    #   may grow up to 1.5x this size to keep a block intact
    # @return [Array<String>] [] for nil, [""] for an empty string, otherwise
    #   chunks whose concatenation equals +content+
    # @raise [ArgumentError] if +chunk_size+ is not positive
    def self.split(content, chunk_size: CHUNK_SIZE)
      return [] if content.nil?
      return [""] if content.empty?
      raise ArgumentError, "chunk_size must be positive" unless chunk_size.positive?
      return [content] if content.length <= chunk_size

      chunks = []
      remaining = content

      until remaining.empty?
        # A whitespace-only tail is attached to the previous chunk rather than
        # emitted as its own (blank) chunk or dropped.
        if remaining.match?(WHITESPACE_ONLY)
          chunks.empty? ? chunks << remaining : chunks[-1] += remaining
          break
        end

        chunk = extract_mixed_chunk(remaining, size: chunk_size)
        if chunk.empty?
          # Defensive: never lose content if no progress can be made.
          chunks << remaining
          break
        end

        chunks << chunk
        remaining = remaining[chunk.length..]
      end

      chunks
    end

    # Returns the leading portion of +text+ that forms the next chunk.
    # Strategies are tried lazily in order; the first one yielding a positive
    # split point no larger than 1.5x +size+ wins, and +size+ itself is the
    # final fallback.
    def self.extract_mixed_chunk(text, size: CHUNK_SIZE)
      return text if text.length <= size
      flexible_size = size * 1.5

      split_point =
        [
          -> { find_nearest_html_end_index(text, size) },
          -> { find_nearest_bbcode_end_index(text, size) },
          -> { find_text_boundary(text, size) },
          -> { size },
        ].lazy.filter_map(&:call).find { |pos| pos.positive? && pos <= flexible_size }

      text[0...split_point]
    end

    # Returns the cumulative serialized length at the end of the first
    # top-level HTML node that extends past +target_pos+, or nil.
    #
    # NOTE(review): offsets are computed from Nokogiri's re-serialized HTML,
    # which presumably matches the source text only when the input is already
    # in the serializer's output form - confirm for entities, attribute
    # quoting and unclosed tags.
    def self.find_nearest_html_end_index(text, target_pos)
      return nil unless text.include?("<")

      current_length = 0
      Nokogiri::HTML.fragment(text).children.each do |node|
        current_length += node.to_html.length
        return current_length if current_length > target_pos
      end
      nil
    rescue Nokogiri::SyntaxError
      nil
    end

    # Returns the end index of the first BBCode / code block that starts at or
    # before +target_pos+ and ends after it, or nil when no block straddles
    # that position.
    def self.find_nearest_bbcode_end_index(text, target_pos)
      BBCODE_PATTERNS.each do |pattern|
        text.scan(pattern) do
          match = Regexp.last_match
          # Matches are yielded left to right; later ones cannot qualify.
          break if match.begin(0) > target_pos

          tag_end = match.end(0)
          return tag_end if tag_end > target_pos
        end
      end

      nil
    end

    # Returns the index just after the last preferred text boundary that
    # starts at or before +target_pos+, or nil when there is none. The split
    # point is placed after the boundary (punctuation and whitespace stay with
    # the preceding chunk), so it is always greater than zero.
    def self.find_text_boundary(text, target_pos)
      TEXT_BOUNDARIES.each do |pattern|
        next unless text.rindex(pattern, target_pos)

        # rindex reports where the match starts; split where it ends.
        pos = Regexp.last_match.end(0)
        # Include all trailing whitespace
        pos += 1 while pos < text.length && text[pos].match?(/\s/)
        return pos
      end
      nil
    end

    # A bare `private` does not apply to `def self.` methods.
    private_class_method :extract_mixed_chunk,
                         :find_nearest_html_end_index,
                         :find_nearest_bbcode_end_index,
                         :find_text_boundary
  end
end

spec/lib/content_splitter_spec.rb

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# frozen_string_literal: true
2+
3+
require "rails_helper"
4+
5+
describe DiscourseTranslator::ContentSplitter do
  # Overrides CHUNK_SIZE for the current example only. stub_const restores the
  # real value automatically after each example, so no manual reset (and no
  # hard-coded copy of the original limit) is needed.
  def set_limit(value)
    stub_const("#{described_class}::CHUNK_SIZE", value)
  end

  it "returns a single empty chunk for empty input" do
    expect(described_class.split("")).to eq([""])
  end

  it "handles nil input" do
    expect(described_class.split(nil)).to eq([])
  end

  it "doesn't split content under limit" do
    text = "hello world"
    expect(described_class.split(text)).to eq([text])
  end

  it "preserves HTML tags" do
    set_limit(10)
    text = "<p>hello</p><p>meow</p>"
    expect(described_class.split(text)).to eq(%w[<p>hello</p> <p>meow</p>])

    set_limit(35)
    text = "<div>hello</div> <div>jurassic</div> <p>world</p>"
    expect(described_class.split(text)).to eq(
      ["<div>hello</div> <div>jurassic</div>", " <p>world</p>"],
    )
  end

  it "preserves BBCode tags" do
    set_limit(20)
    text = "[quote]hello[/quote][details]world[/details]"
    expect(described_class.split(text)).to eq(["[quote]hello[/quote]", "[details]world[/details]"])
  end

  it "doesn't split in middle of words" do
    set_limit(10)
    text = "my kitty best in the world"
    expect(described_class.split(text)).to eq(["my kitty ", "best in ", "the world"])
  end

  it "handles nested tags properly" do
    set_limit(25)
    text = "<div>hello<p>cat</p>world</div><p>meow</p>"
    expect(described_class.split(text)).to eq(%w[<div>hello<p>cat</p>world</div> <p>meow</p>])
  end

  it "handles mixed HTML and BBCode" do
    set_limit(15)
    text = "<div>hello</div>[quote]world[/quote]<p>beautiful</p>"
    expect(described_class.split(text)).to eq(
      ["<div>hello</div>", "[quote]world[/quote]", "<p>beautiful</p>"],
    )
  end

  it "preserves newlines in sensible places" do
    set_limit(10)
    text = "hello\nbeautiful\nworld\n"
    expect(described_class.split(text)).to eq(["hello\n", "beautiful\n", "world\n"])
  end

  it "handles email content properly" do
    set_limit(20)
    text = "From: [email protected]\nTo: [email protected]\nSubject: Hello\n\nContent here"
    expect(described_class.split(text)).to eq(
      ["From: [email protected]\n", "To: [email protected]\n", "Subject: Hello\n\n", "Content here"],
    )
  end

  it "keeps code blocks intact" do
    set_limit(30)
    text = "Text\n```\ncode block\nhere\n```\nmore text"
    expect(described_class.split(text)).to eq(["Text\n```\ncode block\nhere\n```\n", "more text"])
  end

  context "with multiple details tags" do
    it "splits correctly between details tags" do
      set_limit(30)
      text = "<details>first content</details><details>second content</details>"
      expect(described_class.split(text)).to eq(
        ["<details>first content</details>", "<details>second content</details>"],
      )
    end
  end
end

spec/services/discourse_ai/translator_spec.rb

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,14 @@
1818
allow(DiscourseAi::Completions::Prompt).to receive(:new).with(
1919
<<~TEXT,
2020
You are an expert translator specializing in converting Markdown content from any source language to target locale "de". Your task is to:
21-
1. Translate the content accurately while preserving all Markdown formatting elements
22-
2. Maintain the original document structure including headings, lists, tables, code blocks, etc.
23-
3. Preserve all links, images, and other media references without translation
24-
4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
25-
5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
26-
6. For ambiguous terms or phrases, choose the most contextually appropriate translation
27-
7. You are being consumed via an API, only EVER return the translated text, do not return any other information
21+
1. Accurately translate text only
22+
2. Preserve all Markdown formatting elements, including incomplete ones
23+
3. Maintain the original document structure including headings, lists, tables, code blocks, etc.
24+
4. Preserve all links, images, and other media references without translation
25+
5. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
26+
6. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
27+
7. For ambiguous terms or phrases, choose the most contextually appropriate translation
28+
8. You are being consumed via an API, only EVER return the translated text, do not return any other information
2829
TEXT
2930
messages: [{ type: :user, content: "cats are great", id: "user" }],
3031
).and_call_original

spec/services/discourse_ai_spec.rb

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,19 @@
3535
end
3636
end
3737

38-
describe ".translate" do
38+
describe ".translate!" do
3939
before { post.set_detected_locale("de") }
4040

41-
it "translates the post and returns [locale, translated_text]" do
41+
it "returns the translated text from the llm" do
4242
DiscourseAi::Completions::Llm.with_prepared_responses(["some translated text"]) do
43-
locale, translated_text = DiscourseTranslator::DiscourseAi.translate(post)
44-
expect(locale).to eq "de"
45-
expect(translated_text).to eq "some translated text"
43+
expect(DiscourseTranslator::DiscourseAi.translate!(post)).to eq "some translated text"
44+
end
45+
end
46+
47+
it "sends the content for splitting and the split content for translation" do
48+
post.update(raw: "#{"a" * 3000} #{"b" * 3000}")
49+
DiscourseAi::Completions::Llm.with_prepared_responses(%w[lol wut]) do
50+
expect(DiscourseTranslator::DiscourseAi.translate!(post)).to eq "lolwut"
4651
end
4752
end
4853
end

0 commit comments

Comments
 (0)