Skip to content

Commit ef612a4

Browse files
committed
DEV: Translate post raw instead of post cooked
1 parent 0babb90 commit ef612a4

20 files changed

+116
-179
lines changed

app/jobs/scheduled/automatic_translation_backfill.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def process_batch
102102
translations_per_model = [translations_per_run / models_translated, 1].max
103103
topic_ids = fetch_untranslated_model_ids(Topic, "title", translations_per_model)
104104
translations_per_model = translations_per_run - topic_ids.size
105-
post_ids = fetch_untranslated_model_ids(Post, "cooked", translations_per_model)
105+
post_ids = fetch_untranslated_model_ids(Post, "raw", translations_per_model)
106106

107107
DiscourseTranslator::VerboseLogger.log(
108108
"Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{backfill_locales.join(", ")}",

app/models/concerns/discourse_translator/translatable.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ def set_detected_locale(locale)
2424
# @param text [String] the translated text
2525
def set_translation(locale, text)
2626
locale = locale.to_s.gsub("_", "-")
27-
text = DiscourseTranslator::TranslatedContentSanitizer.sanitize(text)
2827
translations.find_or_initialize_by(locale: locale).update!(translation: text)
2928
end
3029

app/services/discourse_ai/language_detector.rb

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
module DiscourseAi
44
class LanguageDetector
55
PROMPT_TEXT = <<~TEXT
6-
I want you to act as a language expert, determining the locale for a set of text.
7-
The locale is a language identifier, such as "en" for English, "de" for German, etc,
8-
and can also include a region identifier, such as "en-GB" for British English, or "zh-Hans" for Simplified Chinese.
9-
I will provide you with text, and you will determine the locale of the text.
10-
Include your locale between <language></language> XML tags.
6+
You are a language expert. Determine the locale for a set of text.
7+
- the locale is a language identifier, such as "en" for English, "de" for German, etc
8+
- it may include a region identifier, such as "en-GB" for British English, or "zh-CN" for Simplified Chinese
9+
- only return the locale
10+
- do not write explanations
1111
TEXT
1212

1313
def initialize(text)
@@ -21,14 +21,13 @@ def detect
2121
messages: [{ type: :user, content: @text, id: "user" }],
2222
)
2323

24-
response =
24+
locale =
2525
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
2626
prompt,
2727
user: Discourse.system_user,
2828
feature_name: "translator-language-detect",
2929
)
30-
31-
(Nokogiri::HTML5.fragment(response).at("language")&.text || response)
30+
locale.strip
3231
end
3332
end
3433
end

app/services/discourse_ai/translator.rb

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
module DiscourseAi
44
class Translator
55
PROMPT_TEMPLATE = <<~TEXT.freeze
6-
You are a highly skilled linguist of many languages and have expert knowledge in HTML.
7-
Your task is to identify the language of the text I provide and accurately translate it into this language locale "%{target_language}" while preserving the meaning, tone, and nuance of the original text.
8-
The text may or may not contain html tags. If they do, preserve them.
9-
Maintain proper grammar, spelling, and punctuation in the translated version.
10-
You will find the text between <input></input> XML tags.
11-
Include your translation between <output></output> XML tags.
12-
Do not write explanations.
6+
You are an expert translator specializing in converting Markdown content from any source language to target locale "%{target_language}". Your task is to:
7+
1. Translate the content accurately while preserving all Markdown formatting elements
8+
2. Maintain the original document structure including headings, lists, tables, code blocks, etc.
9+
3. Preserve all links, images, and other media references without translation
10+
4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
11+
5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
12+
6. For ambiguous terms or phrases, choose the most contextually appropriate translation
13+
7. You are being consumed via an API, only EVER return the translated text, do not return any other information
1314
TEXT
1415

1516
def initialize(text, target_language)
@@ -21,17 +22,14 @@ def translate
2122
prompt =
2223
DiscourseAi::Completions::Prompt.new(
2324
build_prompt(@target_language),
24-
messages: [{ type: :user, content: "<input>#{@text}</input>", id: "user" }],
25+
messages: [{ type: :user, content: "#{@text}", id: "user" }],
2526
)
2627

27-
llm_translation =
28-
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
29-
prompt,
30-
user: Discourse.system_user,
31-
feature_name: "translator-translate",
32-
)
33-
34-
(Nokogiri::HTML5.fragment(llm_translation).at("output")&.inner_html || "").strip
28+
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
29+
prompt,
30+
user: Discourse.system_user,
31+
feature_name: "translator-translate",
32+
)
3533
end
3634

3735
private

app/services/discourse_translator/base.rb

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def self.translate(translatable, target_locale_sym = I18n.locale)
3333
detected_lang = detect(translatable)
3434

3535
if translatable.locale_matches?(target_locale_sym)
36-
return detected_lang, get_untranslated(translatable)
36+
return detected_lang, get_untranslated_cooked(translatable)
3737
end
3838

3939
translation = translatable.translation_for(target_locale_sym)
@@ -50,7 +50,9 @@ def self.translate(translatable, target_locale_sym = I18n.locale)
5050
end
5151

5252
translated = translate!(translatable, target_locale_sym)
53-
save_translation(translatable, target_locale_sym) { translated }
53+
save_translation(translatable, target_locale_sym) do
54+
TranslatedContentNormalizer.normalize(translatable, translated)
55+
end
5456
[detected_lang, translated]
5557
end
5658

@@ -122,25 +124,25 @@ def self.translate_supported?(detected_lang, target_lang)
122124

123125
private
124126

125-
def self.strip_tags_for_detection(detection_text)
126-
html_doc = Nokogiri::HTML::DocumentFragment.parse(detection_text)
127-
html_doc.css("img", "aside.quote", "div.lightbox-wrapper", "a.mention,a.lightbox").remove
128-
html_doc.to_html
129-
end
130-
131127
def self.text_for_detection(translatable)
132-
strip_tags_for_detection(get_untranslated(translatable)).truncate(
133-
DETECTION_CHAR_LIMIT,
134-
omission: nil,
135-
)
128+
get_untranslated_raw(translatable).truncate(DETECTION_CHAR_LIMIT, omission: nil)
136129
end
137130

138131
def self.text_for_translation(translatable)
139132
max_char = SiteSetting.max_characters_per_translation
140-
get_untranslated(translatable).truncate(max_char, omission: nil)
133+
get_untranslated_raw(translatable).truncate(max_char, omission: nil)
134+
end
135+
136+
def self.get_untranslated_raw(translatable)
137+
case translatable.class.name
138+
when "Post"
139+
translatable.raw
140+
when "Topic"
141+
translatable.title
142+
end
141143
end
142144

143-
def self.get_untranslated(translatable)
145+
def self.get_untranslated_cooked(translatable)
144146
case translatable.class.name
145147
when "Post"
146148
translatable.cooked
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseTranslator
4+
class TranslatedContentNormalizer
5+
def self.normalize(translatable, content)
6+
case translatable.class.name
7+
when "Post"
8+
PrettyText.cook(content)
9+
when "Topic"
10+
PrettyText.cleanup(content, {})
11+
end
12+
end
13+
end
14+
end

lib/discourse_translator/translated_content_sanitizer.rb

Lines changed: 0 additions & 9 deletions
This file was deleted.

lib/discourse_translator/translator_selection_validator.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ def valid_value?(val)
2020
def error_message
2121
return I18n.t("translator.discourse_ai.not_installed") if !defined?(::DiscourseAi)
2222

23-
I18n.t("translator.discourse_ai.ai_helper_required") if !SiteSetting.ai_helper_enabled
23+
if !SiteSetting.ai_helper_enabled
24+
I18n.t("translator.discourse_ai.ai_helper_required", { base_url: Discourse.base_url })
25+
end
2426
end
2527
end
2628
end

spec/jobs/automatic_translation_backfill_spec.rb

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def expect_google_translate(text)
102102
described_class.new.execute
103103

104104
expect(topic.translations.pluck(:locale, :translation)).to eq([%w[es hola]])
105-
expect(post.translations.pluck(:locale, :translation)).to eq([%w[de hallo]])
105+
expect(post.translations.pluck(:locale, :translation)).to eq([%w[de <p>hallo</p>]])
106106
end
107107
end
108108

@@ -126,7 +126,7 @@ def expect_google_translate(text)
126126

127127
expect(topic.translations.pluck(:locale, :translation)).to eq([%w[de hallo]])
128128
expect(posts.map { |p| p.translations.pluck(:locale, :translation).flatten }).to eq(
129-
[%w[de hallo]] * 4,
129+
[%w[de <p>hallo</p>]] * 4,
130130
)
131131
end
132132
end
@@ -186,27 +186,27 @@ def expect_google_translate(text)
186186
post_1.update!(updated_at: 4.days.ago)
187187
post_7.update!(updated_at: 5.days.ago)
188188

189-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
189+
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, %w[de es])
190190
expect(result).to include(post_6.id, post_3.id, post_2.id, post_1.id, post_7.id)
191191
end
192192

193193
it "does not return posts that are deleted" do
194194
post_1.trash!
195-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
195+
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, %w[de es])
196196
expect(result).not_to include(post_1.id)
197197
end
198198

199199
it "does not return posts that are empty" do
200-
post_1.cooked = ""
200+
post_1.raw = ""
201201
post_1.save!(validate: false)
202-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
202+
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, %w[de es])
203203
expect(result).not_to include(post_1.id)
204204
end
205205

206206
it "does not return posts by bots" do
207207
post_1.update(user: Discourse.system_user)
208208

209-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
209+
result = described_class.new.fetch_untranslated_model_ids(Post, "raw", 50, %w[de es])
210210

211211
expect(result).not_to include(post_1.id)
212212
end
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# frozen_string_literal: true
2+
3+
describe DiscourseTranslator::TranslatedContentNormalizer do
4+
fab!(:post)
5+
fab!(:topic)
6+
7+
it "normalizes the content" do
8+
expect(
9+
DiscourseTranslator::TranslatedContentNormalizer.normalize(
10+
post,
11+
"<script>alert('test')</script><p> <h1>Testing</h1> This is a test post</p>",
12+
),
13+
).to eq("<p> </p><h1>Testing</h1> This is a test post<p></p>")
14+
15+
expect(
16+
DiscourseTranslator::TranslatedContentNormalizer.normalize(
17+
topic,
18+
"<script>alert('test')</script><p> <h1>Testing</h1> This is a test post</p>",
19+
),
20+
).to eq("<p> </p><h1>Testing</h1> This is a test post<p></p>")
21+
end
22+
end

0 commit comments

Comments
 (0)