Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit d3b5c65

Browse files
committed
FIX: Ignore captions and quotes when detecting locale
1 parent 40fa527 commit d3b5c65

File tree

6 files changed

+127
-12
lines changed

6 files changed

+127
-12
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# frozen_string_literal: true
2+
3+
module DiscourseAi
  module Translation
    # Chooses which text of a post is handed to locale detection, working from
    # the post's cooked HTML.
    class PostDetectionText
      # Stripped in the first pass: quotes/blockquotes and image captions.
      NECESSARY_REMOVALS = ["blockquote, aside.quote", ".lightbox-wrapper"].freeze

      # Stripped in the second pass: oneboxes (external content), code blocks,
      # hashtags, emoji and mentions.
      PREFERRED_REMOVALS = [
        "aside.onebox",
        "code, pre",
        "a.hashtag-cooked",
        "img.emoji",
        "a.mention",
      ].freeze

      private_constant :NECESSARY_REMOVALS, :PREFERRED_REMOVALS

      # Returns the text to use for detecting the locale of +post+.
      #
      # Returns nil when +post+ or its cooked HTML is blank. Otherwise returns
      # the first non-blank candidate, in this order:
      #   1. the text left after both removal passes,
      #   2. the text left after only the first removal pass,
      #   3. the stripped text of the unfiltered cooked HTML (returned as-is,
      #      even when it is empty).
      def self.get_text(post)
        html = post.cooked if post.present?
        return if html.blank?

        fragment = Nokogiri::HTML.fragment(html)
        unfiltered = fragment.text.strip

        # The fragment is mutated in place, so each snapshot below reflects
        # every removal performed up to that point.
        NECESSARY_REMOVALS.each { |selector| fragment.css(selector).remove }
        without_quotes = fragment.text.strip

        PREFERRED_REMOVALS.each { |selector| fragment.css(selector).remove }
        prose_only = fragment.text.strip

        [prose_only, without_quotes].find(&:present?) || unfiltered
      end
    end
  end
end

lib/translation/post_locale_detector.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ class PostLocaleDetector
66
def self.detect_locale(post)
77
return if post.blank?
88

9-
detected_locale = LanguageDetector.new(post.raw).detect
9+
text = PostDetectionText.get_text(post)
10+
detected_locale = LanguageDetector.new(text).detect
1011
locale = LocaleNormalizer.normalize_to_i18n(detected_locale)
1112
post.update_column(:locale, locale)
1213
locale

lib/translation/topic_locale_detector.rb

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,7 @@ class TopicLocaleDetector
66
def self.detect_locale(topic)
77
return if topic.blank?
88

9-
text = topic.title.dup
10-
text << " #{topic.first_post.raw}" if topic.first_post.raw
11-
12-
detected_locale = LanguageDetector.new(text).detect
9+
detected_locale = LanguageDetector.new(topic.title.dup).detect
1310
locale = LocaleNormalizer.normalize_to_i18n(detected_locale)
1411
topic.update_column(:locale, locale)
1512
locale
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# frozen_string_literal: true
2+
3+
describe DiscourseAi::Translation::PostDetectionText do
  describe ".get_text" do
    let(:post) { Fabricate.build(:post) }

    # Assigns +html+ as the post's cooked content and returns what
    # .get_text selects for that post.
    def detection_text_for(html)
      post.cooked = html
      described_class.get_text(post)
    end

    it "returns nil when post is nil" do
      expect(described_class.get_text(nil)).to be_nil
    end

    it "returns nil when post.cooked is nil" do
      expect(detection_text_for(nil)).to be_nil
    end

    it "handles simple text" do
      expect(detection_text_for("<p>Hello world</p>")).to eq("Hello world")
    end

    it "removes quotes" do
      html = "<p>Hello </p><blockquote><p>Quote</p></blockquote><p>World</p>"
      expect(detection_text_for(html)).to eq("Hello World")
    end

    it "removes Discourse quotes" do
      html = '<p>Hello </p><aside class="quote"><p>Quote</p></aside><p>World</p>'
      expect(detection_text_for(html)).to eq("Hello World")
    end

    it "removes image captions" do
      html = '<p>Hello </p><div class="lightbox-wrapper">Caption text</div><p>World</p>'
      expect(detection_text_for(html)).to eq("Hello World")
    end

    it "removes oneboxes" do
      html = '<p>Hello </p><aside class="onebox">Onebox content</aside><p>World</p>'
      expect(detection_text_for(html)).to eq("Hello World")
    end

    it "removes code blocks" do
      html = "<p>Hello </p><pre><code>Code block</code></pre><p>World</p>"
      expect(detection_text_for(html)).to eq("Hello World")
    end

    it "removes hashtags" do
      html = '<p>Hello </p><a class="hashtag-cooked">#hashtag</a><p>World</p>'
      expect(detection_text_for(html)).to eq("Hello World")
    end

    it "removes emoji" do
      html = '<p>Hello </p><img class="emoji" alt=":smile:" title=":smile:"><p>World</p>'
      expect(detection_text_for(html)).to eq("Hello World")
    end

    it "removes mentions" do
      html = '<p>Hello </p><a class="mention">@user</a><p>World</p>'
      expect(detection_text_for(html)).to eq("Hello World")
    end

    it "falls back to necessary text when preferred is empty" do
      # Only the quote goes in the first pass; the mention survives until the second.
      html = '<aside class="quote">Quote</aside><a class="mention">@user</a>'
      expect(detection_text_for(html)).to eq("@user")
    end

    it "falls back to cooked when all filtering removes all content" do
      expect(detection_text_for("<blockquote>Quote</blockquote>")).to eq("Quote")
    end

    it "handles complex nested content correctly" do
      html = [
        "<p>Hello </p>",
        '<div class="lightbox-wrapper"><p>Image caption</p><img src="test.jpg"></div>',
        "<blockquote><p>Quote text</p></blockquote>",
        "<p>World</p>",
        "<pre><code>Code block</code></pre>",
        '<a class="mention">@user</a>',
      ].join

      expect(detection_text_for(html)).to eq("Hello World")
    end
  end
end

spec/lib/translation/post_locale_detector_spec.rb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
describe DiscourseAi::Translation::PostLocaleDetector do
44
describe ".detect_locale" do
5-
fab!(:post) { Fabricate(:post, raw: "Hello world", locale: nil) }
5+
fab!(:post) { Fabricate(:post, cooked: "Hello world", locale: nil) }
66

77
def language_detector_stub(opts)
88
mock = instance_double(DiscourseAi::Translation::LanguageDetector)
@@ -17,16 +17,16 @@ def language_detector_stub(opts)
1717
end
1818

1919
it "updates the post locale with the detected locale" do
20-
language_detector_stub({ text: post.raw, locale: "zh_CN" })
20+
language_detector_stub({ text: post.cooked, locale: "zh_CN" })
2121
expect { described_class.detect_locale(post) }.to change { post.reload.locale }.from(nil).to(
2222
"zh_CN",
2323
)
2424
end
2525

2626
it "bypasses validations when updating locale" do
27-
post.update_column(:raw, "A")
27+
post.update_column(:cooked, "A")
2828

29-
language_detector_stub({ text: post.raw, locale: "zh_CN" })
29+
language_detector_stub({ text: post.cooked, locale: "zh_CN" })
3030

3131
described_class.detect_locale(post)
3232
expect(post.reload.locale).to eq("zh_CN")

spec/lib/translation/topic_locale_detector_spec.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
describe DiscourseAi::Translation::TopicLocaleDetector do
44
describe ".detect_locale" do
55
fab!(:topic) { Fabricate(:topic, title: "this is a cat topic", locale: nil) }
6-
fab!(:post) { Fabricate(:post, raw: "and kittens", topic:) }
6+
fab!(:post) { Fabricate(:post, topic:) }
77

88
def language_detector_stub(opts)
99
mock = instance_double(DiscourseAi::Translation::LanguageDetector)
@@ -18,7 +18,7 @@ def language_detector_stub(opts)
1818
end
1919

2020
it "updates the topic locale with the detected locale" do
21-
language_detector_stub({ text: "This is a cat topic and kittens", locale: "zh_CN" })
21+
language_detector_stub({ text: "This is a cat topic", locale: "zh_CN" })
2222
expect { described_class.detect_locale(topic) }.to change { topic.reload.locale }.from(
2323
nil,
2424
).to("zh_CN")
@@ -29,7 +29,7 @@ def language_detector_stub(opts)
2929
SiteSetting.min_topic_title_length = 15
3030
SiteSetting.max_topic_title_length = 16
3131

32-
language_detector_stub({ text: "A and kittens", locale: "zh_CN" })
32+
language_detector_stub({ text: "A", locale: "zh_CN" })
3333

3434
described_class.detect_locale(topic)
3535
expect(topic.reload.locale).to eq("zh_CN")

0 commit comments

Comments
 (0)