33module DiscourseAi
44 module Translation
55 class PostDetectionText
6+ NECESSARY_REMOVAL_SELECTORS = [
7+ ".lightbox-wrapper" , # image captions
8+ "blockquote, aside.quote" , # quotes
9+ ]
10+ OPTIONAL_SELECTORS = [
11+ "a.hashtag-cooked" , # categories or tags are usually in the site's language
12+ "a.mention" , # mentions are based on the mentioned user's name
13+ "aside.onebox" , # onebox external content
14+ "img.emoji" ,
15+ "code, pre" ,
16+ ]
17+
618 def self . get_text ( post )
719 return if post . blank?
820 cooked = post . cooked
@@ -11,24 +23,12 @@ def self.get_text(post)
1123 doc = Nokogiri ::HTML5 . fragment ( cooked )
1224 original = doc . text . strip
1325
14- # quotes and blockquotes
15- doc . css ( "blockquote, aside.quote" ) . remove
16- # image captions
17- doc . css ( ".lightbox-wrapper" ) . remove
18-
26+ # these selectors should be removed,
27+ # as they are the usual culprits for incorrect detection
28+ doc . css ( *NECESSARY_REMOVAL_SELECTORS ) . remove
1929 necessary = doc . text . strip
2030
21- # oneboxes (external content)
22- doc . css ( "aside.onebox" ) . remove
23- # code blocks
24- doc . css ( "code, pre" ) . remove
25- # hashtags
26- doc . css ( "a.hashtag-cooked" ) . remove
27- # emoji
28- doc . css ( "img.emoji" ) . remove
29- # mentions
30- doc . css ( "a.mention" ) . remove
31-
31+ doc . css ( *OPTIONAL_SELECTORS ) . remove
3232 preferred = doc . text . strip
3333
3434 return preferred if preferred . present?
0 commit comments