Skip to content

Commit 6c197c5

Browse files
authored
DEV: Translate raw content instead of cooked content (#245)
Raw content is generally much smaller than cooked content. Cooked content tends to contain a lot of HTML, especially when it includes oneboxes, mentions, and images, and its size also increases the time it takes to translate. By sending raw instead, we can cook the translated content after it returns from the API. This also helps with sanitisation.
1 parent 9cb68ec commit 6c197c5

20 files changed

+121
-181
lines changed

app/jobs/scheduled/automatic_translation_backfill.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def process_batch
101101
backfill_locales.each_with_index do |target_locale, i|
102102
topic_ids =
103103
fetch_untranslated_model_ids(Topic, "title", records_to_translate, target_locale)
104-
post_ids = fetch_untranslated_model_ids(Post, "cooked", records_to_translate, target_locale)
104+
post_ids = fetch_untranslated_model_ids(Post, "raw", records_to_translate, target_locale)
105105

106106
# if we end up translating fewer records than records_to_translate,
107107
# add to the value so that the next locales can have more quota

app/models/concerns/discourse_translator/translatable.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ def set_detected_locale(locale)
2424
# @param text [String] the translated text
2525
def set_translation(locale, text)
2626
locale = locale.to_s.gsub("_", "-")
27-
text = DiscourseTranslator::TranslatedContentSanitizer.sanitize(text)
2827
translations.find_or_initialize_by(locale: locale).update!(translation: text)
2928
end
3029

app/services/discourse_ai/language_detector.rb

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
module DiscourseAi
44
class LanguageDetector
55
PROMPT_TEXT = <<~TEXT
6-
I want you to act as a language expert, determining the locale for a set of text.
7-
The locale is a language identifier, such as "en" for English, "de" for German, etc,
8-
and can also include a region identifier, such as "en-GB" for British English, or "zh-Hans" for Simplified Chinese.
9-
I will provide you with text, and you will determine the locale of the text.
10-
Include your locale between <language></language> XML tags.
6+
You are a language expert and will determine the locale for user-written content.
7+
- the locale is a language identifier, such as "en" for English, "de" for German, or "zh-CN" for Simplified Chinese, etc.
8+
- use the vocabulary and grammar of content to determine the locale
9+
- do not use links or code to determine the locale
10+
- do not write explanations
11+
- only return the locale
1112
TEXT
1213

1314
def initialize(text)
@@ -21,14 +22,13 @@ def detect
2122
messages: [{ type: :user, content: @text, id: "user" }],
2223
)
2324

24-
response =
25+
locale =
2526
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
2627
prompt,
2728
user: Discourse.system_user,
2829
feature_name: "translator-language-detect",
2930
)
30-
31-
(Nokogiri::HTML5.fragment(response).at("language")&.text || response)
31+
locale&.strip
3232
end
3333
end
3434
end

app/services/discourse_ai/translator.rb

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
module DiscourseAi
44
class Translator
55
PROMPT_TEMPLATE = <<~TEXT.freeze
6-
You are a highly skilled linguist of many languages and have expert knowledge in HTML.
7-
Your task is to identify the language of the text I provide and accurately translate it into this language locale "%{target_language}" while preserving the meaning, tone, and nuance of the original text.
8-
The text may or may not contain html tags. If they do, preserve them.
9-
Maintain proper grammar, spelling, and punctuation in the translated version.
10-
You will find the text between <input></input> XML tags.
11-
Include your translation between <output></output> XML tags.
12-
Do not write explanations.
6+
You are an expert translator specializing in converting Markdown content from any source language to target locale "%{target_language}". Your task is to:
7+
1. Translate the content accurately while preserving all Markdown formatting elements
8+
2. Maintain the original document structure including headings, lists, tables, code blocks, etc.
9+
3. Preserve all links, images, and other media references without translation
10+
4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
11+
5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
12+
6. For ambiguous terms or phrases, choose the most contextually appropriate translation
13+
7. You are being consumed via an API, only EVER return the translated text, do not return any other information
1314
TEXT
1415

1516
def initialize(text, target_language)
@@ -21,17 +22,14 @@ def translate
2122
prompt =
2223
DiscourseAi::Completions::Prompt.new(
2324
build_prompt(@target_language),
24-
messages: [{ type: :user, content: "<input>#{@text}</input>", id: "user" }],
25+
messages: [{ type: :user, content: "#{@text}", id: "user" }],
2526
)
2627

27-
llm_translation =
28-
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
29-
prompt,
30-
user: Discourse.system_user,
31-
feature_name: "translator-translate",
32-
)
33-
34-
(Nokogiri::HTML5.fragment(llm_translation).at("output")&.inner_html || "").strip
28+
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
29+
prompt,
30+
user: Discourse.system_user,
31+
feature_name: "translator-translate",
32+
)
3533
end
3634

3735
private

app/services/discourse_translator/base.rb

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def self.translate(translatable, target_locale_sym = I18n.locale)
3333
detected_lang = detect(translatable)
3434

3535
if translatable.locale_matches?(target_locale_sym)
36-
return detected_lang, get_untranslated(translatable)
36+
return detected_lang, get_untranslated_cooked(translatable)
3737
end
3838

3939
translation = translatable.translation_for(target_locale_sym)
@@ -50,7 +50,9 @@ def self.translate(translatable, target_locale_sym = I18n.locale)
5050
end
5151

5252
translated = translate!(translatable, target_locale_sym)
53-
save_translation(translatable, target_locale_sym) { translated }
53+
save_translation(translatable, target_locale_sym) do
54+
TranslatedContentNormalizer.normalize(translatable, translated)
55+
end
5456
[detected_lang, translated]
5557
end
5658

@@ -122,25 +124,25 @@ def self.translate_supported?(detected_lang, target_lang)
122124

123125
private
124126

125-
def self.strip_tags_for_detection(detection_text)
126-
html_doc = Nokogiri::HTML::DocumentFragment.parse(detection_text)
127-
html_doc.css("img", "aside.quote", "div.lightbox-wrapper", "a.mention,a.lightbox").remove
128-
html_doc.to_html
129-
end
130-
131127
def self.text_for_detection(translatable)
132-
strip_tags_for_detection(get_untranslated(translatable)).truncate(
133-
DETECTION_CHAR_LIMIT,
134-
omission: nil,
135-
)
128+
get_untranslated_raw(translatable).truncate(DETECTION_CHAR_LIMIT, omission: nil)
136129
end
137130

138131
def self.text_for_translation(translatable)
139132
max_char = SiteSetting.max_characters_per_translation
140-
get_untranslated(translatable).truncate(max_char, omission: nil)
133+
get_untranslated_raw(translatable).truncate(max_char, omission: nil)
134+
end
135+
136+
# Returns the source text used for language detection and translation:
# a Post's `raw` or a Topic's `title`.
#
# @param translatable [Post, Topic] the record whose text is wanted
# @return [String, nil] nil when the record is neither a Post nor a Topic
#   (the case has no else branch); text_for_detection and text_for_translation
#   call `truncate` directly on the result, so they rely on a Post or Topic.
def self.get_untranslated_raw(translatable)
137+
# Dispatches on the class name string rather than on the class constant.
case translatable.class.name
138+
when "Post"
139+
translatable.raw
140+
when "Topic"
141+
translatable.title
142+
end
141143
end
142144

143-
def self.get_untranslated(translatable)
145+
def self.get_untranslated_cooked(translatable)
144146
case translatable.class.name
145147
when "Post"
146148
translatable.cooked
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseTranslator
4+
# Turns the text returned by a translation call into the value that is stored
# as the translation (Base.translate runs it inside its save_translation block).
class TranslatedContentNormalizer
5+
# @param translatable [Post, Topic] the record the translation belongs to
# @param content [String] the translated text
# @return [String, nil] PrettyText.cook output for a Post, PrettyText.cleanup
#   output for a Topic, and nil for any other class (no else branch).
def self.normalize(translatable, content)
6+
# Dispatches on the class name string rather than on the class constant.
case translatable.class.name
7+
when "Post"
8+
# Posts are translated from raw, so the translation is cooked before storing.
PrettyText.cook(content)
9+
when "Topic"
10+
# NOTE(review): titles go through cleanup with empty options instead of cook;
# the exact cleanup behaviour is defined outside this file, so confirm it there.
PrettyText.cleanup(content, {})
11+
end
12+
end
13+
end
14+
end

lib/discourse_translator/translated_content_sanitizer.rb

Lines changed: 0 additions & 9 deletions
This file was deleted.

lib/discourse_translator/translator_selection_validator.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ def valid_value?(val)
2020
def error_message
2121
return I18n.t("translator.discourse_ai.not_installed") if !defined?(::DiscourseAi)
2222

23-
I18n.t("translator.discourse_ai.ai_helper_required") if !SiteSetting.ai_helper_enabled
23+
if !SiteSetting.ai_helper_enabled
24+
I18n.t("translator.discourse_ai.ai_helper_required", { base_url: Discourse.base_url })
25+
end
2426
end
2527
end
2628
end

spec/jobs/automatic_translation_backfill_spec.rb

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def expect_google_translate(text)
102102
described_class.new.execute
103103

104104
expect(topic.translations.pluck(:locale, :translation)).to eq([%w[es hola]])
105-
expect(post.translations.pluck(:locale, :translation)).to eq([%w[de hallo]])
105+
expect(post.translations.pluck(:locale, :translation)).to eq([%w[de <p>hallo</p>]])
106106
end
107107
end
108108

@@ -126,7 +126,7 @@ def expect_google_translate(text)
126126

127127
expect(topic.translations.pluck(:locale, :translation)).to eq([%w[de hallo]])
128128
expect(posts.map { |p| p.translations.pluck(:locale, :translation).flatten }).to eq(
129-
[%w[de hallo]] * 4,
129+
[%w[de <p>hallo</p>]] * 4,
130130
)
131131
end
132132
end
@@ -176,27 +176,27 @@ def expect_google_translate(text)
176176
post_1.update!(updated_at: 2.days.ago)
177177
post_2.update!(updated_at: 3.days.ago)
178178

179-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
179+
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de")
180180
expect(result).to include(post_6.id, post_1.id, post_2.id)
181181
end
182182

183183
it "does not return posts that are deleted" do
184184
post_1.trash!
185-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
185+
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de")
186186
expect(result).not_to include(post_1.id)
187187
end
188188

189189
it "does not return posts that are empty" do
190-
post_1.cooked = ""
190+
post_1.raw = ""
191191
post_1.save!(validate: false)
192-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
192+
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de")
193193
expect(result).not_to include(post_1.id)
194194
end
195195

196196
it "does not return posts by bots" do
197197
post_1.update(user: Discourse.system_user)
198198

199-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
199+
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, "de")
200200

201201
expect(result).not_to include(post_1.id)
202202
end
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# frozen_string_literal: true
2+
3+
describe DiscourseTranslator::TranslatedContentNormalizer do
4+
fab!(:post)
5+
fab!(:topic)
6+
7+
# Passes the same HTML string through normalize once for a Post and once for
# a Topic. Both expectations require the <script> element to be gone from the
# result and the remaining markup to match the exact string given.
it "normalizes the content" do
8+
# Post branch.
expect(
9+
DiscourseTranslator::TranslatedContentNormalizer.normalize(
10+
post,
11+
"<script>alert('test')</script><p> <h1>Testing</h1> This is a test post</p>",
12+
),
13+
).to eq("<p> </p><h1>Testing</h1> This is a test post<p></p>")
14+
15+
# Topic branch; the expected string is identical to the Post case.
expect(
16+
DiscourseTranslator::TranslatedContentNormalizer.normalize(
17+
topic,
18+
"<script>alert('test')</script><p> <h1>Testing</h1> This is a test post</p>",
19+
),
20+
).to eq("<p> </p><h1>Testing</h1> This is a test post<p></p>")
21+
end
22+
end

0 commit comments

Comments
 (0)