Skip to content

Commit e142757

Browse files
authored
FIX: Strips img tags (emojis) when detecting language (#182)
Without this, a text containing many emojis (which are rendered as img tags) will unfortunately always be detected as 'en'.
1 parent 08c8adb commit e142757

File tree

3 files changed

+38
-2
lines changed

3 files changed

+38
-2
lines changed

app/services/discourse_translator/base.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,12 @@ def self.get_text(topic_or_post)
6161
end
6262
end
6363

64+
def self.strip_img_for_detection(detection_text)
65+
html_doc = Nokogiri::HTML::DocumentFragment.parse(detection_text)
66+
html_doc.css("img").remove
67+
html_doc.to_html
68+
end
69+
6470
def self.language_supported?(detected_lang)
6571
raise NotImplementedError unless self.const_defined?(:SUPPORTED_LANG_MAPPING)
6672
supported_lang = const_get(:SUPPORTED_LANG_MAPPING)

app/services/discourse_translator/google.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,11 @@ def self.access_token
7676
end
7777

7878
def self.detect(topic_or_post)
79+
detection_text = get_text(topic_or_post).truncate(MAXLENGTH, omission: nil)
80+
detection_text = strip_img_for_detection(detection_text)
7981
topic_or_post.custom_fields[DiscourseTranslator::DETECTED_LANG_CUSTOM_FIELD] ||= result(
8082
DETECT_URI,
81-
q: get_text(topic_or_post).truncate(MAXLENGTH, omission: nil),
83+
q: detection_text,
8284
)[
8385
"detections"
8486
][

spec/services/google_spec.rb

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
end
2020

2121
describe ".detect" do
22-
let(:post) { Fabricate(:post) }
22+
fab!(:post)
2323

2424
it "should store the detected language in a custom field" do
2525
detected_lang = "en"
@@ -74,6 +74,34 @@
7474

7575
expect(described_class.detect(post)).to eq(detected_lang)
7676
end
77+
78+
it "strips img tags from detection text" do
79+
post.cooked = "there are some words <img src='http://example.com/image.jpg'> to be said"
80+
detected_lang = "en"
81+
82+
request_url = "#{DiscourseTranslator::Google::DETECT_URI}"
83+
body = { q: "there are some words to be said", key: api_key }
84+
85+
Excon
86+
.expects(:post)
87+
.with(
88+
request_url,
89+
body: URI.encode_www_form(body),
90+
headers: {
91+
"Content-Type" => "application/x-www-form-urlencoded",
92+
"Referer" => "http://test.localhost",
93+
},
94+
)
95+
.returns(
96+
mock_response.new(
97+
200,
98+
%{ { "data": { "detections": [ [ { "language": "#{detected_lang}", "isReliable": false, "confidence": 0.18397073 } ] ] } } },
99+
),
100+
)
101+
.once
102+
103+
expect(described_class.detect(post)).to eq(detected_lang)
104+
end
77105
end
78106

79107
describe ".translate_supported?" do

0 commit comments

Comments
 (0)