Skip to content

Commit 352493d

Browse files
authored
FEATURE: Add max character for translation setting (#183)
When translating a post, most of our available providers have an arbitrary character limit. In Google, this arbitrary limit is 5000 characters. This commit removes the arbitrary per-provider limits by moving them to a single site setting, max_characters_per_translation, and leaves the hard limits set by each provider (Amazon 10k, Azure 50k) in place.
1 parent e142757 commit 352493d

File tree

10 files changed

+102
-29
lines changed

10 files changed

+102
-29
lines changed

app/services/discourse_translator/amazon.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def self.access_token_key
108108
end
109109

110110
def self.detect(topic_or_post)
111-
text = truncate get_text(topic_or_post)
111+
text = truncate text_for_detection(topic_or_post)
112112

113113
return if text.blank?
114114

@@ -133,7 +133,7 @@ def self.translate(topic_or_post)
133133
result =
134134
client.translate_text(
135135
{
136-
text: truncate(get_text(topic_or_post)),
136+
text: truncate(text_for_translation(topic_or_post)),
137137
source_language_code: "auto",
138138
target_language_code: SUPPORTED_LANG_MAPPING[I18n.locale],
139139
},

app/services/discourse_translator/base.rb

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ class ProblemCheckedTranslationError < TranslatorError
1010
end
1111

1212
class Base
13+
DETECTION_CHAR_LIMIT = 1000
14+
1315
def self.key_prefix
1416
"#{PLUGIN_NAME}:".freeze
1517
end
@@ -61,17 +63,27 @@ def self.get_text(topic_or_post)
6163
end
6264
end
6365

66+
def self.language_supported?(detected_lang)
67+
raise NotImplementedError unless self.const_defined?(:SUPPORTED_LANG_MAPPING)
68+
supported_lang = const_get(:SUPPORTED_LANG_MAPPING)
69+
return false if supported_lang[I18n.locale].nil?
70+
detected_lang != supported_lang[I18n.locale]
71+
end
72+
73+
private
74+
6475
def self.strip_img_for_detection(detection_text)
6576
html_doc = Nokogiri::HTML::DocumentFragment.parse(detection_text)
6677
html_doc.css("img").remove
6778
html_doc.to_html
6879
end
6980

70-
def self.language_supported?(detected_lang)
71-
raise NotImplementedError unless self.const_defined?(:SUPPORTED_LANG_MAPPING)
72-
supported_lang = const_get(:SUPPORTED_LANG_MAPPING)
73-
return false if supported_lang[I18n.locale].nil?
74-
detected_lang != supported_lang[I18n.locale]
81+
def self.text_for_detection(topic_or_post)
82+
strip_img_for_detection(get_text(topic_or_post).truncate(DETECTION_CHAR_LIMIT, omission: nil))
83+
end
84+
85+
def self.text_for_translation(topic_or_post)
86+
get_text(topic_or_post).truncate(SiteSetting.max_characters_per_translation, omission: nil)
7587
end
7688
end
7789
end

app/services/discourse_translator/google.rb

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ class Google < Base
88
TRANSLATE_URI = "https://www.googleapis.com/language/translate/v2".freeze
99
DETECT_URI = "https://www.googleapis.com/language/translate/v2/detect".freeze
1010
SUPPORT_URI = "https://www.googleapis.com/language/translate/v2/languages".freeze
11-
MAXLENGTH = 5000
1211

1312
# Hash which maps Discourse's locale code to Google Translate's locale code found in
1413
# https://cloud.google.com/translate/docs/languages
@@ -76,11 +75,9 @@ def self.access_token
7675
end
7776

7877
def self.detect(topic_or_post)
79-
detection_text = get_text(topic_or_post).truncate(MAXLENGTH, omission: nil)
80-
detection_text = strip_img_for_detection(detection_text)
8178
topic_or_post.custom_fields[DiscourseTranslator::DETECTED_LANG_CUSTOM_FIELD] ||= result(
8279
DETECT_URI,
83-
q: detection_text,
80+
q: text_for_detection(topic_or_post),
8481
)[
8582
"detections"
8683
][
@@ -115,7 +112,7 @@ def self.translate(topic_or_post)
115112
res =
116113
result(
117114
TRANSLATE_URI,
118-
q: get_text(topic_or_post).truncate(MAXLENGTH, omission: nil),
115+
q: text_for_translation(topic_or_post),
119116
source: detected_lang,
120117
target: SUPPORTED_LANG_MAPPING[I18n.locale],
121118
)

app/services/discourse_translator/libre_translate.rb

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55

66
module DiscourseTranslator
77
class LibreTranslate < Base
8-
MAXLENGTH = 5000
9-
108
SUPPORTED_LANG_MAPPING = {
119
en: "en",
1210
en_GB: "en",
@@ -84,11 +82,7 @@ def self.detect(topic_or_post)
8482
res =
8583
result(
8684
detect_uri,
87-
q:
88-
ActionController::Base
89-
.helpers
90-
.strip_tags(get_text(topic_or_post))
91-
.truncate(MAXLENGTH, omission: nil),
85+
q: ActionController::Base.helpers.strip_tags(text_for_detection(topic_or_post)),
9286
)
9387

9488
if !res.empty?
@@ -116,7 +110,7 @@ def self.translate(topic_or_post)
116110
res =
117111
result(
118112
translate_uri,
119-
q: get_text(topic_or_post).truncate(MAXLENGTH, omission: nil),
113+
q: text_for_translation(topic_or_post),
120114
source: detected_lang,
121115
target: SUPPORTED_LANG_MAPPING[I18n.locale],
122116
format: "html",

app/services/discourse_translator/microsoft.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,7 @@ def self.access_token_key
9696

9797
def self.detect(topic_or_post)
9898
topic_or_post.custom_fields[DiscourseTranslator::DETECTED_LANG_CUSTOM_FIELD] ||= begin
99-
text = get_text(topic_or_post).truncate(LENGTH_LIMIT, omission: nil)
100-
101-
body = [{ "Text" => text }].to_json
99+
body = [{ "Text" => text_for_detection(topic_or_post) }].to_json
102100

103101
uri = URI(detect_endpoint)
104102
uri.query = URI.encode_www_form(self.default_query)
@@ -125,7 +123,7 @@ def self.translate(topic_or_post)
125123
from_custom_fields(topic_or_post) do
126124
query = default_query.merge("from" => detected_lang, "to" => locale, "textType" => "html")
127125

128-
body = [{ "Text" => get_text(topic_or_post) }].to_json
126+
body = [{ "Text" => text_for_translation(topic_or_post) }].to_json
129127

130128
uri = URI(translate_endpoint)
131129
uri.query = URI.encode_www_form(query)

app/services/discourse_translator/yandex.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def self.access_token
125125

126126
def self.detect(topic_or_post)
127127
topic_or_post.custom_fields[DiscourseTranslator::DETECTED_LANG_CUSTOM_FIELD] ||= begin
128-
query = default_query.merge("text" => get_text(topic_or_post))
128+
query = default_query.merge("text" => text_for_detection(topic_or_post))
129129

130130
uri = URI(DETECT_URI)
131131
uri.query = URI.encode_www_form(query)
@@ -149,7 +149,7 @@ def self.translate(topic_or_post)
149149
query =
150150
default_query.merge(
151151
"lang" => "#{detected_lang}-#{locale}",
152-
"text" => get_text(topic_or_post),
152+
"text" => text_for_translation(topic_or_post),
153153
"format" => "html",
154154
)
155155

config/locales/server.en.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ en:
66
translator_azure_region: "Azure Region"
77
translator_google_api_key: "Google API Key"
88
translator_yandex_api_key: "Yandex API Key"
9+
max_characters_per_translation: "The maximum number of characters that can be sent for translation. If content is longer than this, text will be truncated. Note that each provider also has their own limits."
910
max_translations_per_minute: "The number of translations per minute a regular user can perform."
1011
translator_libretranslate_endpoint: "LibreTranslate API Endpoint"
1112
translator_libretranslate_api_key: "LibreTranslate API Key"

config/settings.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ discourse_translator:
8787
default: ''
8888
translator_libretranslate_api_key:
8989
default: ''
90+
max_characters_per_translation:
91+
default: 5000
92+
client: true
9093
max_translations_per_minute:
9194
default: 3
9295
restrict_translation_by_group:

spec/services/base_spec.rb

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,35 @@ class EmptyTranslator < DiscourseTranslator::Base
3232
expect(TestTranslator.language_supported?("pt")).to eq(false)
3333
end
3434
end
35+
36+
describe ".text_for_detection" do
37+
fab!(:post)
38+
39+
it "strips img tags" do
40+
post.cooked = "<img src='http://example.com/image.png' />"
41+
expect(DiscourseTranslator::Base.text_for_detection(post)).to eq("")
42+
end
43+
44+
it "truncates to DETECTION_CHAR_LIMIT of 1000" do
45+
post.cooked = "a" * 1001
46+
expect(DiscourseTranslator::Base.text_for_detection(post).length).to eq(1000)
47+
end
48+
49+
it "returns the text if it's less than DETECTION_CHAR_LIMIT" do
50+
text = "a" * 999
51+
post.cooked = text
52+
expect(DiscourseTranslator::Base.text_for_detection(post)).to eq(text)
53+
end
54+
end
55+
56+
describe ".text_for_translation" do
57+
fab!(:post)
58+
59+
it "truncates to max_characters_per_translation" do
60+
post.cooked = "a" * (SiteSetting.max_characters_per_translation + 1)
61+
expect(DiscourseTranslator::Base.text_for_translation(post).length).to eq(
62+
SiteSetting.max_characters_per_translation,
63+
)
64+
end
65+
end
3566
end

spec/services/google_spec.rb

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,14 @@
4343
end
4444
end
4545

46-
it "should truncate string to 5000 characters" do
47-
length = 6000
46+
it "should truncate string to 1000 characters" do
47+
length = 2000
4848
post.cooked = rand(36**length).to_s(36)
4949
detected_lang = "en"
5050

5151
request_url = "#{DiscourseTranslator::Google::DETECT_URI}"
5252
body = {
53-
q: post.cooked.truncate(DiscourseTranslator::Google::MAXLENGTH, omission: nil),
53+
q: post.cooked.truncate(DiscourseTranslator::Google::DETECTION_CHAR_LIMIT, omission: nil),
5454
key: api_key,
5555
}
5656

@@ -166,5 +166,42 @@
166166

167167
expect { described_class.translate(post) }.to raise_error DiscourseTranslator::TranslatorError
168168
end
169+
170+
it "truncates text for translation to max_characters_per_translation setting" do
171+
SiteSetting.max_characters_per_translation = 50
172+
post.cooked = "a" * 100
173+
post.custom_fields[DiscourseTranslator::DETECTED_LANG_CUSTOM_FIELD] = "de"
174+
post.save_custom_fields
175+
body = {
176+
q: post.cooked.truncate(SiteSetting.max_characters_per_translation, omission: nil),
177+
source: "de",
178+
target: "en",
179+
key: api_key,
180+
}
181+
182+
translated_text = "hur dur hur dur"
183+
Excon
184+
.expects(:post)
185+
.with(
186+
DiscourseTranslator::Google::TRANSLATE_URI,
187+
body: URI.encode_www_form(body),
188+
headers: {
189+
"Content-Type" => "application/x-www-form-urlencoded",
190+
"Referer" => "http://test.localhost",
191+
},
192+
)
193+
.returns(
194+
mock_response.new(
195+
200,
196+
%{ { "data": { "translations": [ { "translatedText": "#{translated_text}" } ] } } },
197+
),
198+
)
199+
.once
200+
Excon.expects(:post).returns(
201+
mock_response.new(200, %{ { "data": { "languages": [ { "language": "de" }] } } }),
202+
)
203+
204+
expect(described_class.translate(post)).to eq(["de", translated_text])
205+
end
169206
end
170207
end

0 commit comments

Comments
 (0)