Skip to content

Commit 5dbe241

Browse files
authored
FEATURE: Better translation parsing via structured outputs (#257)
1 parent 6c2c08c commit 5dbe241

File tree

7 files changed

+197
-58
lines changed

7 files changed

+197
-58
lines changed

app/services/discourse_ai/base_translator.rb

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,49 @@ def initialize(text, target_language)
1010
def translate
1111
prompt =
1212
DiscourseAi::Completions::Prompt.new(
13-
build_prompt(@target_language),
14-
messages: [{ type: :user, content: "#{@text}", id: "user" }],
13+
prompt_template,
14+
messages: [{ type: :user, content: formatted_content, id: "user" }],
1515
)
1616

17-
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
18-
prompt,
19-
user: Discourse.system_user,
20-
feature_name: "translator-translate",
21-
)
17+
response =
18+
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
19+
prompt,
20+
user: Discourse.system_user,
21+
feature_name: "translator-translate",
22+
extra_model_params: response_format,
23+
)
24+
25+
JSON.parse(response)&.dig("translation")
2226
end
2327

24-
private
28+
def formatted_content
29+
{ content: @text, target_language: @target_language }.to_json
30+
end
2531

26-
def build_prompt(target_language)
27-
prompt_template % { target_language: target_language }
32+
def response_format
33+
{
34+
response_format: {
35+
type: "json_schema",
36+
json_schema: {
37+
name: "reply",
38+
schema: {
39+
type: "object",
40+
properties: {
41+
translation: {
42+
type: "string",
43+
},
44+
},
45+
required: ["translation"],
46+
additionalProperties: false,
47+
},
48+
strict: true,
49+
},
50+
},
51+
}
2852
end
2953

54+
private
55+
3056
def prompt_template
3157
raise NotImplementedError
3258
end

app/services/discourse_ai/language_detector.rb

Lines changed: 59 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,39 @@
33
module DiscourseAi
44
class LanguageDetector
55
PROMPT_TEXT = <<~TEXT
6-
You are a language expert and will determine the locale for user-written content.
7-
- the locale is a language identifier, such as "en" for English, "de" for German, or "zh-CN" for Simplified Chinese, etc.
8-
- use the vocabulary and grammar of content to determine the locale
9-
- do not use links or code to determine the locale
10-
- do not write explanations
11-
- only return the locale
6+
You will be given a piece of text, and your task is to detect the locale (language) of the text and return it in a specific JSON format.
7+
8+
To complete this task, follow these steps:
9+
10+
1. Carefully read and analyze the provided text.
11+
2. Determine the language of the text based on its characteristics, such as vocabulary, grammar, and sentence structure.
12+
3. Do not use links or programming code in the text to detect the locale.
13+
4. Identify the appropriate language code for the detected language.
14+
15+
Here is a list of common language codes for reference:
16+
- English: en
17+
- Spanish: es
18+
- French: fr
19+
- German: de
20+
- Italian: it
21+
- Brazilian Portuguese: pt-BR
22+
- Russian: ru
23+
- Simplified Chinese: zh-CN
24+
- Japanese: ja
25+
- Korean: ko
26+
27+
If the language is not in this list, use the appropriate IETF language tag code.
28+
29+
5. Format your response as a JSON object with a single key "locale" and the value as the language code.
30+
31+
Your output should be in the following format:
32+
<output>
33+
{"locale": "xx"}
34+
</output>
35+
36+
Where "xx" is replaced by the appropriate language code.
37+
38+
Important: Base your analysis solely on the provided text. Do not use any external information or make assumptions about the text's origin or context beyond what is explicitly provided.
1239
TEXT
1340

1441
def initialize(text)
@@ -22,13 +49,37 @@ def detect
2249
messages: [{ type: :user, content: @text, id: "user" }],
2350
)
2451

25-
locale =
52+
response =
2653
DiscourseAi::Completions::Llm.proxy(SiteSetting.ai_helper_model).generate(
2754
prompt,
2855
user: Discourse.system_user,
2956
feature_name: "translator-language-detect",
57+
extra_model_params: response_format,
3058
)
31-
locale&.strip
59+
60+
locale = JSON.parse(response)&.dig("locale")
61+
end
62+
63+
def response_format
64+
{
65+
response_format: {
66+
type: "json_schema",
67+
json_schema: {
68+
name: "reply",
69+
schema: {
70+
type: "object",
71+
properties: {
72+
locale: {
73+
type: "string",
74+
},
75+
},
76+
required: ["locale"],
77+
additionalProperties: false,
78+
},
79+
strict: true,
80+
},
81+
},
82+
}
3283
end
3384
end
3485
end

app/services/discourse_ai/post_translator.rb

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,37 @@
33
module DiscourseAi
44
class PostTranslator < BaseTranslator
55
PROMPT_TEMPLATE = <<~TEXT.freeze
6-
Translate this content to "%{target_language}". You must:
7-
1. Translate the content accurately while preserving any Markdown, HTML elements, or newlines
6+
You are a highly skilled translator tasked with translating content from one language to another. Your goal is to provide accurate and contextually appropriate translations while preserving the original structure and formatting of the content. Follow these instructions carefully:
7+
8+
Translation Instructions:
9+
1. Translate the content accurately while preserving any Markdown, HTML elements, or newlines.
810
2. Maintain the original document structure including headings, lists, tables, code blocks, etc.
9-
3. Preserve all links, images, and other media references without translation
10-
4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
11-
5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
12-
6. For ambiguous terms or phrases, choose the most contextually appropriate translation
13-
7. Do not add any content besides the translation
14-
8. The translation must not have other languages other than the original and the target language
15-
9. You are being consumed via an API, only EVER return the translated text, do not return any other information
11+
3. Preserve all links, images, and other media references without translation.
12+
4. Handle code snippets appropriately:
13+
- Do not translate variable names, functions, or syntax within code blocks (```).
14+
- Translate comments within code blocks.
15+
5. For technical terminology:
16+
- Provide the accepted target language term if it exists.
17+
- If no equivalent exists, transliterate the term and include the original term in parentheses.
18+
6. For ambiguous terms or phrases, choose the most contextually appropriate translation.
19+
7. Do not add any content besides the translation.
20+
8. Ensure the translation only contains the original language and the target language.
21+
22+
Output your translation in the following JSON format:
23+
{"translation": "Your TARGET_LANGUAGE translation here"}
24+
25+
Here are three examples of correct translations:
26+
27+
Original: {"content":"New Update for Minecraft Adds Underwater Temples", "target_language":"Spanish"}
28+
Correct translation: {"translation": "Nueva actualización para Minecraft añade templos submarinos"}
29+
30+
Original: {"content": "# Machine Learning 101\n\nMachine Learning (ML) is a subset of Artificial Intelligence (AI) that focuses on the development of algorithms and statistical models that enable computer systems to improve their performance on a specific task through experience.\n\n## Key Concepts\n\n1. **Supervised Learning**: The algorithm learns from labeled training data.\n2. **Unsupervised Learning**: The algorithm finds patterns in unlabeled data.\n3. **Reinforcement Learning**: The algorithm learns through interaction with an environment.\n\n```python\n# Simple example of a machine learning model\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\n\n# Assuming X and y are your features and target variables\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\nmodel = LogisticRegression()\nmodel.fit(X_train, y_train)\n\n# Evaluate the model\naccuracy = model.score(X_test, y_test)\nprint(f'Model accuracy: {accuracy}')\n```\n\nFor more information, visit [Machine Learning on Wikipedia](https://en.wikipedia.org/wiki/Machine_learning).", "target_language":"French"}
31+
Correct translation: {"translation": "# Machine Learning 101\n\nLe Machine Learning (ML) est un sous-ensemble de l'Intelligence Artificielle (IA) qui se concentre sur le développement d'algorithmes et de modèles statistiques permettant aux systèmes informatiques d'améliorer leurs performances sur une tâche spécifique grâce à l'expérience.\n\n## Concepts clés\n\n1. **Apprentissage supervisé** : L'algorithme apprend à partir de données d'entraînement étiquetées.\n2. **Apprentissage non supervisé** : L'algorithme trouve des motifs dans des données non étiquetées.\n3. **Apprentissage par renforcement** : L'algorithme apprend à travers l'interaction avec un environnement.\n\n```python\n# Exemple simple d'un modèle de machine learning\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\n\n# En supposant que X et y sont vos variables de caractéristiques et cibles\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\nmodel = LogisticRegression()\nmodel.fit(X_train, y_train)\n\n# Évaluer le modèle\naccuracy = model.score(X_test, y_test)\nprint(f'Model accuracy: {accuracy}')\n```\n\nPour plus d'informations, visitez [Machine Learning sur Wikipedia](https://en.wikipedia.org/wiki/Machine_learning)."}
32+
33+
Original: {"content": "**Heathrow fechado**: paralisação de voos deve continuar nos próximos dias, diz gestora do aeroporto de *Londres*", "target_language": "English"}
34+
Correct translation: {"translation": "**Heathrow closed**: flight disruption expected to continue in coming days, says *London* airport management"}
35+
36+
Remember, you are being consumed via an API. Only return the translated text in the specified JSON format. Do not include any additional information or explanations in your response.
1637
TEXT
1738

1839
private def prompt_template

app/services/discourse_ai/topic_translator.rb

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,40 @@
33
module DiscourseAi
44
class TopicTranslator < BaseTranslator
55
PROMPT_TEMPLATE = <<~TEXT.freeze
6-
Translate this topic title to "%{target_language}"
7-
- Keep the original language when it is a proper noun or technical term
8-
- The translation should be around the same length as the original
9-
TEXT
6+
You are a translation service specializing in translating forum post titles from their original language into the requested target_language. Your task is to provide accurate and contextually appropriate translations while adhering to the following guidelines:
7+
8+
1. Translate the given title from its original language into the requested target_language.
9+
2. Keep proper nouns and technical terms in their original language.
10+
3. Attempt to keep the translated title length close to the original when possible.
11+
4. Ensure the translation maintains the original meaning and tone.
12+
13+
To complete this task:
14+
15+
1. Read and understand the title carefully.
16+
2. Identify any proper nouns or technical terms that should remain untranslated.
17+
3. Translate the remaining words and phrases into the target_language, ensuring the meaning is preserved.
18+
4. Adjust the translation if necessary to keep the length similar to the original title.
19+
5. Review your translation for accuracy and naturalness in the target_language.
20+
21+
Provide your translation in the following JSON format:
22+
23+
<output>
24+
{"translation": "Your target_language translation here"}
25+
</output>
26+
27+
Here are three examples of correct translations:
28+
29+
Original: {"title":"New Update for Minecraft Adds Underwater Temples", "target_language":"Spanish"}
30+
Correct translation: {"translation": "Nueva actualización para Minecraft añade templos submarinos"}
31+
32+
Original: {"title":"Toyota announces revolutionary battery technology", "target_language":"French"}
33+
Correct translation: {"translation": "Toyota annonce une technologie de batteries révolutionnaire"}
34+
35+
Original: {"title": "Heathrow fechado: paralisação de voos deve continuar nos próximos dias, diz gestora do aeroporto de Londres", "target_language": "English"}
36+
Correct translation: {"translation": "Heathrow closed: flight disruption expected to continue in coming days, says London airport management"}
37+
38+
Remember to keep proper nouns like "Minecraft" and "Toyota" in their original form. Translate the title now and provide your answer in the specified JSON format.
39+
TEXT
1040

1141
private def prompt_template
1242
PROMPT_TEMPLATE

spec/services/discourse_ai/base_translator_spec.rb

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,24 @@
1313
describe ".translate" do
1414
let(:text_to_translate) { "cats are great" }
1515
let(:target_language) { "de" }
16+
let(:llm_response) { "{\"translation\":\"hur dur hur dur!\"}" }
1617

1718
it "creates the correct prompt" do
19+
post_translator = DiscourseAi::PostTranslator.new(text_to_translate, target_language)
1820
allow(DiscourseAi::Completions::Prompt).to receive(:new).with(
19-
<<~TEXT,
20-
Translate this content to "de". You must:
21-
1. Translate the content accurately while preserving any Markdown, HTML elements, or newlines
22-
2. Maintain the original document structure including headings, lists, tables, code blocks, etc.
23-
3. Preserve all links, images, and other media references without translation
24-
4. Handle code snippets appropriately - don't translate variable names, functions, or syntax within code blocks (```), but translate comments
25-
5. When encountering technical terminology, provide the accepted target language term if it exists, or transliterate if no equivalent exists, with the original term in parentheses
26-
6. For ambiguous terms or phrases, choose the most contextually appropriate translation
27-
7. Do not add any content besides the translation
28-
8. The translation must not have other languages other than the original and the target language
29-
9. You are being consumed via an API, only EVER return the translated text, do not return any other information
30-
TEXT
31-
messages: [{ type: :user, content: "cats are great", id: "user" }],
21+
DiscourseAi::PostTranslator::PROMPT_TEMPLATE,
22+
messages: [{ type: :user, content: post_translator.formatted_content, id: "user" }],
3223
).and_call_original
3324

34-
DiscourseAi::PostTranslator.new(text_to_translate, target_language).translate
25+
DiscourseAi::Completions::Llm.with_prepared_responses([llm_response]) do
26+
post_translator.translate
27+
end
3528
end
3629

3730
it "sends the translation prompt to the selected ai helper model" do
3831
mock_prompt = instance_double(DiscourseAi::Completions::Prompt)
3932
mock_llm = instance_double(DiscourseAi::Completions::Llm)
33+
post_translator = DiscourseAi::PostTranslator.new(text_to_translate, target_language)
4034

4135
allow(DiscourseAi::Completions::Prompt).to receive(:new).and_return(mock_prompt)
4236
allow(DiscourseAi::Completions::Llm).to receive(:proxy).with(
@@ -46,13 +40,14 @@
4640
mock_prompt,
4741
user: Discourse.system_user,
4842
feature_name: "translator-translate",
49-
)
43+
extra_model_params: post_translator.response_format,
44+
).and_return(llm_response)
5045

51-
DiscourseAi::PostTranslator.new(text_to_translate, target_language).translate
46+
post_translator.translate
5247
end
5348

5449
it "returns the translation from the llm's response" do
55-
DiscourseAi::Completions::Llm.with_prepared_responses(["hur dur hur dur!"]) do
50+
DiscourseAi::Completions::Llm.with_prepared_responses([llm_response]) do
5651
expect(
5752
DiscourseAi::PostTranslator.new(text_to_translate, target_language).translate,
5853
).to eq "hur dur hur dur!"

spec/services/discourse_ai/language_detector_spec.rb

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,18 @@
1111
end
1212

1313
describe ".detect" do
14+
let(:locale_detector) { described_class.new("meow") }
15+
let(:llm_response) { "{\"translation\":\"hur dur hur dur!\"}" }
16+
1417
it "creates the correct prompt" do
1518
allow(DiscourseAi::Completions::Prompt).to receive(:new).with(
1619
DiscourseAi::LanguageDetector::PROMPT_TEXT,
1720
messages: [{ type: :user, content: "meow", id: "user" }],
1821
).and_call_original
1922

20-
described_class.new("meow").detect
23+
DiscourseAi::Completions::Llm.with_prepared_responses([llm_response]) do
24+
locale_detector.detect
25+
end
2126
end
2227

2328
it "sends the language detection prompt to the ai helper model" do
@@ -32,16 +37,15 @@
3237
mock_prompt,
3338
user: Discourse.system_user,
3439
feature_name: "translator-language-detect",
35-
).and_return("hi")
40+
extra_model_params: locale_detector.response_format,
41+
).and_return(llm_response)
3642

37-
DiscourseAi::Completions::Llm.with_prepared_responses(["de"]) do
38-
described_class.new("meow").detect
39-
end
43+
locale_detector.detect
4044
end
4145

4246
it "returns the language from the llm's response in the language tag" do
43-
DiscourseAi::Completions::Llm.with_prepared_responses(["de"]) do
44-
expect(described_class.new("meow").detect).to eq "de"
47+
DiscourseAi::Completions::Llm.with_prepared_responses([llm_response]) do
48+
locale_detector.detect
4549
end
4650
end
4751
end

spec/services/discourse_ai_spec.rb

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
describe ".detect!" do
2929
it "returns the detected language" do
3030
locale = "de"
31-
DiscourseAi::Completions::Llm.with_prepared_responses(["de"]) do
31+
DiscourseAi::Completions::Llm.with_prepared_responses([locale_json(locale)]) do
3232
expect(DiscourseTranslator::DiscourseAi.detect!(post)).to eq locale
3333
end
3434
end
@@ -41,7 +41,9 @@
4141
end
4242

4343
it "translates the post and returns [locale, translated_text]" do
44-
DiscourseAi::Completions::Llm.with_prepared_responses(["some translated text"]) do
44+
DiscourseAi::Completions::Llm.with_prepared_responses(
45+
[translation_json("some translated text")],
46+
) do
4547
locale, translated_text = DiscourseTranslator::DiscourseAi.translate(post)
4648
expect(locale).to eq "de"
4749
expect(translated_text).to eq "<p>some translated text</p>"
@@ -50,11 +52,21 @@
5052

5153
it "translates the topic" do
5254
allow(::DiscourseAi::TopicTranslator).to receive(:new).and_call_original
53-
DiscourseAi::Completions::Llm.with_prepared_responses(["some translated text"]) do
55+
DiscourseAi::Completions::Llm.with_prepared_responses(
56+
[translation_json("some translated text")],
57+
) do
5458
locale, translated_text = DiscourseTranslator::DiscourseAi.translate(topic)
5559
expect(locale).to eq "de"
5660
expect(translated_text).to eq "some translated text"
5761
end
5862
end
5963
end
64+
65+
def locale_json(content)
66+
{ locale: content }.to_json
67+
end
68+
69+
def translation_json(content)
70+
{ translation: content }.to_json
71+
end
6072
end

0 commit comments

Comments
 (0)