diff --git a/lib/personas/locale_detector.rb b/lib/personas/locale_detector.rb index b04a54b2b..86ae927ab 100644 --- a/lib/personas/locale_detector.rb +++ b/lib/personas/locale_detector.rb @@ -32,7 +32,9 @@ def system_prompt If the language is not in this list, use the appropriate IETF language tag code. - 5. Format your response as a JSON object with a single key "locale" and the value as the language code. + 5. Avoid using `und` and prefer `en` over `en-US` or `en-GB` unless the text specifically indicates a regional variant. + + 6. Format your response as a JSON object with a single key "locale" and the value as the language code. Your output should be in the following format: @@ -52,6 +54,23 @@ def response_format def temperature 0 end + + def examples + spanish = <<~MARKDOWN + [quote] + Non smettere mai di credere nella bellezza dei tuoi sogni. Anche quando tutto sembra perduto, c'è sempre una luce che aspetta di essere trovata. + + Ogni passo, anche il più piccolo, ti avvicina a ciò che desideri. La forza che cerchi è già dentro di te. + [/quote] + + ¿Cuál es el mensaje principal de esta cita? + MARKDOWN + + [ + ["Can you tell me what '私の世界で一番好きな食べ物はちらし丼です' means?", { locale: "en" }.to_json], + [spanish, { locale: "es" }.to_json], + ] + end end end end diff --git a/lib/personas/post_raw_translator.rb b/lib/personas/post_raw_translator.rb index cb7df3053..beb052020 100644 --- a/lib/personas/post_raw_translator.rb +++ b/lib/personas/post_raw_translator.rb @@ -9,20 +9,20 @@ def self.default_enabled def system_prompt <<~PROMPT.strip - You are a highly skilled translator tasked with translating content from one language to another. Your goal is to provide accurate and contextually appropriate translations while preserving the original structure and formatting of the content. Follow these instructions carefully: + You are a highly skilled translator tasked with translating content from one language to another. 
Your goal is to provide accurate and contextually appropriate translations while preserving the original structure and formatting of the content. Follow these instructions strictly: - 1. Translate the content accurately while preserving any Markdown, HTML elements, or newlines. + 1. Preserve Markdown elements, HTML elements, or newlines. Text must be translated without altering the original formatting. 2. Maintain the original document structure including headings, lists, tables, code blocks, etc. 3. Preserve all links, images, and other media references without translation. - 4. Handle code snippets appropriately: - - Do not translate variable names, functions, or syntax within code blocks (```). - - Translate comments within code blocks. - 5. For technical terminology: + 4. For technical terminology: - Provide the accepted target language term if it exists. - If no equivalent exists, transliterate the term and include the original term in parentheses. - 6. For ambiguous terms or phrases, choose the most contextually appropriate translation. - 7. Do not add any content besides the translation. - 8. Ensure the translation only contains the original language and the target language. + 5. For ambiguous terms or phrases, choose the most contextually appropriate translation. + 6. Ensure the translation only contains the original language and the target language. + + Follow these instructions on what NOT to do: + 7. Do not translate code snippets or programming language names, but ensure that any comments within the code are translated. + 8. Do not add any content besides the translation. 
The text to translate will be provided in JSON format with the following structure: {"content": "Text to translate", "target_locale": "Target language code"} @@ -62,17 +62,6 @@ def examples }.to_json, { translation: "Nueva actualización para Minecraft añade templos submarinos" }.to_json, ], - [ - { - content: - "# Machine Learning 101\n\nMachine Learning (ML) is a subset of Artificial Intelligence (AI) that focuses on the development of algorithms and statistical models that enable computer systems to improve their performance on a specific task through experience.\n\n## Key Concepts\n\n1. **Supervised Learning**: The algorithm learns from labeled training data.\n2. **Unsupervised Learning**: The algorithm finds patterns in unlabeled data.\n3. **Reinforcement Learning**: The algorithm learns through interaction with an environment.\n\n```python\n# Simple example of a machine learning model\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\n\n# Assuming X and y are your features and target variables\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\nmodel = LogisticRegression()\nmodel.fit(X_train, y_train)\n\n# Evaluate the model\naccuracy = model.score(X_test, y_test)\nprint(f'Model accuracy: {accuracy}')\n```\n\nFor more information, visit [Machine Learning on Wikipedia](https://en.wikipedia.org/wiki/Machine_learning).", - target_locale: "fr", - }.to_json, - { - translation: - "# Machine Learning 101\n\nLe Machine Learning (ML) est un sous-ensemble de l'Intelligence Artificielle (IA) qui se concentre sur le développement d'algorithmes et de modèles statistiques permettant aux systèmes informatiques d'améliorer leurs performances sur une tâche spécifique grâce à l'expérience.\n\n## Concepts clés\n\n1. **Apprentissage supervisé** : L'algorithme apprend à partir de données d'entraînement étiquetées.\n2. 
**Apprentissage non supervisé** : L'algorithme trouve des motifs dans des données non étiquetées.\n3. **Apprentissage par renforcement** : L'algorithme apprend à travers l'interaction avec un environnement.\n\n```python\n# Exemple simple d'un modèle de machine learning\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.linear_model import LogisticRegression\n\n# En supposant que X et y sont vos variables de caractéristiques et cibles\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\nmodel = LogisticRegression()\nmodel.fit(X_train, y_train)\n\n# Évaluer le modèle\naccuracy = model.score(X_test, y_test)\nprint(f'Model accuracy: {accuracy}')\n```\n\nPour plus d'informations, visitez [Machine Learning sur Wikipedia](https://en.wikipedia.org/wiki/Machine_learning).", - }.to_json, - ], ] end end diff --git a/lib/translation/language_detector.rb b/lib/translation/language_detector.rb index bdb470f3d..8cdfe02d0 100644 --- a/lib/translation/language_detector.rb +++ b/lib/translation/language_detector.rb @@ -5,8 +5,10 @@ module Translation class LanguageDetector DETECTION_CHAR_LIMIT = 1000 - def initialize(text) + def initialize(text, topic: nil, post: nil) @text = text + @topic = topic + @post = post end def detect @@ -36,6 +38,8 @@ def detect skip_tool_details: true, feature_name: "translation", messages: [{ type: :user, content: @text }], + topic: @topic, + post: @post, ) structured_output = nil diff --git a/lib/translation/post_detection_text.rb b/lib/translation/post_detection_text.rb new file mode 100644 index 000000000..c5dcc6629 --- /dev/null +++ b/lib/translation/post_detection_text.rb @@ -0,0 +1,40 @@ +# frozen_string_literal: true + +module DiscourseAi + module Translation + class PostDetectionText + NECESSARY_REMOVAL_SELECTORS = [ + ".lightbox-wrapper", # image captions + "blockquote, aside.quote", # quotes + ] + OPTIONAL_SELECTORS = [ + "a.hashtag-cooked", # categories or tags are usually in site's 
language + "a.mention", # mentions are based on the mentioned's user's name + "aside.onebox", # onebox external content + "img.emoji", + "code, pre", + ] + + def self.get_text(post) + return if post.blank? + cooked = post.cooked + return if cooked.blank? + + doc = Nokogiri::HTML5.fragment(cooked) + original = doc.text.strip + + # these selectors should be removed, + # as they are the usual culprits for incorrect detection + doc.css(*NECESSARY_REMOVAL_SELECTORS).remove + necessary = doc.text.strip + + doc.css(*OPTIONAL_SELECTORS).remove + preferred = doc.text.strip + + return preferred if preferred.present? + return necessary if necessary.present? + original + end + end + end +end diff --git a/lib/translation/post_locale_detector.rb b/lib/translation/post_locale_detector.rb index 41fc5a0ac..74f625bf3 100644 --- a/lib/translation/post_locale_detector.rb +++ b/lib/translation/post_locale_detector.rb @@ -6,7 +6,8 @@ class PostLocaleDetector def self.detect_locale(post) return if post.blank? - detected_locale = LanguageDetector.new(post.raw).detect + text = PostDetectionText.get_text(post) + detected_locale = LanguageDetector.new(text, post:).detect locale = LocaleNormalizer.normalize_to_i18n(detected_locale) post.update_column(:locale, locale) locale diff --git a/lib/translation/topic_locale_detector.rb b/lib/translation/topic_locale_detector.rb index 2ad77d291..81ac30252 100644 --- a/lib/translation/topic_locale_detector.rb +++ b/lib/translation/topic_locale_detector.rb @@ -6,10 +6,7 @@ class TopicLocaleDetector def self.detect_locale(topic) return if topic.blank? 
- text = topic.title.dup - text << " #{topic.first_post.raw}" if topic.first_post.raw - - detected_locale = LanguageDetector.new(text).detect + detected_locale = LanguageDetector.new(topic.title.dup, topic:).detect locale = LocaleNormalizer.normalize_to_i18n(detected_locale) topic.update_column(:locale, locale) locale diff --git a/spec/lib/translation/post_detection_text_spec.rb b/spec/lib/translation/post_detection_text_spec.rb new file mode 100644 index 000000000..e6c82eb92 --- /dev/null +++ b/spec/lib/translation/post_detection_text_spec.rb @@ -0,0 +1,77 @@ +# frozen_string_literal: true + +describe DiscourseAi::Translation::PostDetectionText do + describe ".get_text" do + let(:post) { Fabricate.build(:post) } + + it "returns nil when post is nil" do + expect(described_class.get_text(nil)).to be_nil + end + + it "returns nil when post.cooked is nil" do + post.cooked = nil + expect(described_class.get_text(post)).to be_nil + end + + it "handles simple text" do + post.cooked = "

Hello world

" + expect(described_class.get_text(post)).to eq("Hello world") + end + + it "removes quotes" do + post.cooked = "

Hello

Quote

World

" + expect(described_class.get_text(post)).to eq("Hello World") + end + + it "removes Discourse quotes" do + post.cooked = '

Hello

World

' + expect(described_class.get_text(post)).to eq("Hello World") + end + + it "removes image captions" do + post.cooked = '

Hello

World

' + expect(described_class.get_text(post)).to eq("Hello World") + end + + it "removes oneboxes" do + post.cooked = '

Hello

World

' + expect(described_class.get_text(post)).to eq("Hello World") + end + + it "removes code blocks" do + post.cooked = "

Hello

Code block

World

" + expect(described_class.get_text(post)).to eq("Hello World") + end + + it "removes hashtags" do + post.cooked = '

Hello

#hashtag

World

' + expect(described_class.get_text(post)).to eq("Hello World") + end + + it "removes emoji" do + post.cooked = '

Hello

:smile:

World

' + expect(described_class.get_text(post)).to eq("Hello World") + end + + it "removes mentions" do + post.cooked = '

Hello

@user

World

' + expect(described_class.get_text(post)).to eq("Hello World") + end + + it "falls back to necessary text when preferred is empty" do + post.cooked = '@user' + expect(described_class.get_text(post)).to eq("@user") + end + + it "falls back to cooked when all filtering removes all content" do + post.cooked = "
Quote
" + expect(described_class.get_text(post)).to eq("Quote") + end + + it "handles complex nested content correctly" do + post.cooked = + '

Hello

Quote text

World

Code block
@user' + expect(described_class.get_text(post)).to eq("Hello World") + end + end +end diff --git a/spec/lib/translation/post_locale_detector_spec.rb b/spec/lib/translation/post_locale_detector_spec.rb index 38a9cd09b..7108937da 100644 --- a/spec/lib/translation/post_locale_detector_spec.rb +++ b/spec/lib/translation/post_locale_detector_spec.rb @@ -2,12 +2,13 @@ describe DiscourseAi::Translation::PostLocaleDetector do describe ".detect_locale" do - fab!(:post) { Fabricate(:post, raw: "Hello world", locale: nil) } + fab!(:post) { Fabricate(:post, cooked: "Hello world", locale: nil) } def language_detector_stub(opts) mock = instance_double(DiscourseAi::Translation::LanguageDetector) allow(DiscourseAi::Translation::LanguageDetector).to receive(:new).with( opts[:text], + post: opts[:post], ).and_return(mock) allow(mock).to receive(:detect).and_return(opts[:locale]) end @@ -17,16 +18,16 @@ def language_detector_stub(opts) end it "updates the post locale with the detected locale" do - language_detector_stub({ text: post.raw, locale: "zh_CN" }) + language_detector_stub({ text: post.cooked, locale: "zh_CN", post: }) expect { described_class.detect_locale(post) }.to change { post.reload.locale }.from(nil).to( "zh_CN", ) end it "bypasses validations when updating locale" do - post.update_column(:raw, "A") + post.update_column(:cooked, "A") - language_detector_stub({ text: post.raw, locale: "zh_CN" }) + language_detector_stub({ text: post.cooked, locale: "zh_CN", post: }) described_class.detect_locale(post) expect(post.reload.locale).to eq("zh_CN") diff --git a/spec/lib/translation/topic_locale_detector_spec.rb b/spec/lib/translation/topic_locale_detector_spec.rb index a2be6c296..f17a9f206 100644 --- a/spec/lib/translation/topic_locale_detector_spec.rb +++ b/spec/lib/translation/topic_locale_detector_spec.rb @@ -3,12 +3,13 @@ describe DiscourseAi::Translation::TopicLocaleDetector do describe ".detect_locale" do fab!(:topic) { Fabricate(:topic, title: "this is a cat topic", 
locale: nil) } - fab!(:post) { Fabricate(:post, raw: "and kittens", topic:) } + fab!(:post) { Fabricate(:post, topic:) } def language_detector_stub(opts) mock = instance_double(DiscourseAi::Translation::LanguageDetector) allow(DiscourseAi::Translation::LanguageDetector).to receive(:new).with( opts[:text], + topic: opts[:topic], ).and_return(mock) allow(mock).to receive(:detect).and_return(opts[:locale]) end @@ -18,7 +19,7 @@ def language_detector_stub(opts) end it "updates the topic locale with the detected locale" do - language_detector_stub({ text: "This is a cat topic and kittens", locale: "zh_CN" }) + language_detector_stub({ text: "This is a cat topic", locale: "zh_CN", topic: }) expect { described_class.detect_locale(topic) }.to change { topic.reload.locale }.from( nil, ).to("zh_CN") @@ -29,7 +30,7 @@ def language_detector_stub(opts) SiteSetting.min_topic_title_length = 15 SiteSetting.max_topic_title_length = 16 - language_detector_stub({ text: "A and kittens", locale: "zh_CN" }) + language_detector_stub({ text: "A", locale: "zh_CN", topic: }) described_class.detect_locale(topic) expect(topic.reload.locale).to eq("zh_CN")