33module DiscourseAi
44 module Translation
55 class PostDetectionText
6+ SELECTORS = [ # CSS selectors that get_text removes from the parsed post HTML, as their text may not be in the user's language
7+ "a.hashtag-cooked" , # categories or tags are usually in the site's language
8+ "a.mention" , # mentions are based on the mentioned user's name
9+ "aside.onebox" , # onebox external content
10+ ".lightbox-wrapper" , # image captions
11+ "blockquote, aside.quote" , # quotes and blockquotes
12+ "img.emoji" , # emoji
13+ "code, pre" , # code blocks
14+ ]
15+
616 def self . get_text ( post )
717 return if post . blank?
818 cooked = post . cooked
@@ -11,29 +21,12 @@ def self.get_text(post)
1121 doc = Nokogiri ::HTML5 . fragment ( cooked )
1222 original = doc . text . strip
1323
14- # quotes and blockquotes
15- doc . css ( "blockquote, aside.quote" ) . remove
16- # image captions
17- doc . css ( ".lightbox-wrapper" ) . remove
18-
24+ # these selectors are preferred to be removed,
25+ # as they may not be in the user's language
26+ doc . css ( *SELECTORS ) . remove
1927 necessary = doc . text . strip
2028
21- # oneboxes (external content)
22- doc . css ( "aside.onebox" ) . remove
23- # code blocks
24- doc . css ( "code, pre" ) . remove
25- # hashtags
26- doc . css ( "a.hashtag-cooked" ) . remove
27- # emoji
28- doc . css ( "img.emoji" ) . remove
29- # mentions
30- doc . css ( "a.mention" ) . remove
31-
32- preferred = doc . text . strip
33-
34- return preferred if preferred . present?
35- return necessary if necessary . present?
36- original
29+ necessary . present? ? necessary : original
3730 end
3831 end
3932 end
0 commit comments