Allow rejection of postings written in a particular alphabet.

ianmacd · jagerman · commit 251b1ebd089a · 2022-09-30T21:42:50.000-03:00
In `sogs.ini`, the user lists the alphabets whose messages should be
rejected by default:

`alphabet_filters = [ arabic, cyrillic, persian ]`

If there are rooms in which these alphabets should be allowed, a list of
room ids per alphabet may be specified for whitelisting.

`alphabet_whitelist_arabic = [ 8 ]`
`alphabet_whitelist_cyrillic = [ 5, 13 ]`
`alphabet_whitelist_persian = [ 18 ]`

A list of valid room ids can be obtained with the following command:

`$ sudo sqlite3 /var/lib/session-open-group-server/sogs.db 'SELECT * FROM rooms;'`

The implementation can easily be extended to more alphabets.
diff --git a/sogs.ini.sample b/sogs.ini.sample
@@ -115,6 +115,20 @@
 ;
 ;profanity_custom =
 
+; Alphabets whose messages should be rejected; an empty list (the default) rejects none.
+;
+;alphabet_filters = [ arabic, cyrillic, persian ]
+;alphabet_filters = []
+
+
+; Messages written in a rejected alphabet can still be allowed in specific rooms by listing
+; those rooms' ids here, as returned by `SELECT * FROM rooms;`. An empty list means the
+; alphabet is rejected in every room.
+;
+;alphabet_whitelist_arabic = []
+;alphabet_whitelist_cyrillic = []
+;alphabet_whitelist_persian = []
+
 
 [web]
 
diff --git a/sogs/config.py b/sogs/config.py
@@ -32,6 +32,10 @@
 PROFANITY_FILTER = False
 PROFANITY_SILENT = True
 PROFANITY_CUSTOM = None
+ALPHABET_FILTERS = []
+ALPHABET_WHITELIST_ARABIC = []
+ALPHABET_WHITELIST_CYRILLIC = []
+ALPHABET_WHITELIST_PERSIAN = []
 REQUIRE_BLIND_KEYS = False
 TEMPLATE_PATH = 'templates'
 STATIC_PATH = 'static'
@@ -83,6 +87,12 @@ def days_to_seconds(v):
     def days_to_seconds_or_none(v):
         return days_to_seconds(v) if v else None
 
+    def list_of_strs(v):  # "[ a, b ]" -> ['a', 'b']; "[]" -> []; the brackets are optional
+        return [s for s in re.split(r'[,\s]+', v.strip().strip('[]')) if s]
+
+    def list_of_ints(v):  # "[ 5, 13 ]" -> [5, 13]; "[]" -> []; int() raises ValueError on junk
+        return [int(i) for i in list_of_strs(v) if i]
+
     truthy = ('y', 'yes', 'Y', 'Yes', 'true', 'True', 'on', 'On', '1')
     falsey = ('n', 'no', 'N', 'No', 'false', 'False', 'off', 'Off', '0')
     booly = truthy + falsey
@@ -126,6 +136,10 @@ def bool_opt(name):
             'profanity_filter': bool_opt('PROFANITY_FILTER'),
             'profanity_silent': bool_opt('PROFANITY_SILENT'),
             'profanity_custom': ('PROFANITY_CUSTOM', path_exists, val_or_none),
+            'alphabet_filters': ('ALPHABET_FILTERS', None, list_of_strs),
+            'alphabet_whitelist_arabic': ('ALPHABET_WHITELIST_ARABIC', None, list_of_ints),
+            'alphabet_whitelist_cyrillic': ('ALPHABET_WHITELIST_CYRILLIC', None, list_of_ints),
+            'alphabet_whitelist_persian': ('ALPHABET_WHITELIST_PERSIAN', None, list_of_ints),
         },
         'web': {
             'template_path': ('TEMPLATE_PATH', path_exists, val_or_none),
diff --git a/sogs/model/room.py b/sogs/model/room.py
@@ -699,28 +699,45 @@ def get_messages_for(
 
     def should_filter(self, user: User, data: bytes):
         """
-        Checks a message for profanity (if the profanity filter is enabled).
+        Checks a message against the alphabet filters listed in ALPHABET_FILTERS and, if
+        enabled, the profanity filter.  A disallowed alphabet always rejects the message.
 
-        - Returns False if this message passes (i.e. didn't trigger the profanity filter, or is
-          being posted by an admin to whom the filter doesn't apply).
+        - Returns False if this message passes (i.e. didn't trigger any filter, or is
+          being posted by an admin to whom the filters don't apply).
 
         Otherwise, depending on the filtering configuration:
         - Returns True if this message should be silently accepted but filtered (i.e. not shown to
           users).
         - Throws PostRejected if the message should be rejected (and rejection passed back to the
           user).
         """
+        msg = Post(raw=data)
+
+        # One row per supported alphabet: (name in ALPHABET_FILTERS, exempt room ids, codepoints).
+        arabic_alpha_codepoints = '[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\ufb50-\ufdff\ufe70-\ufeff]'
+        cyrillic_alpha_codepoints = '[\u0400-\u04ff]'
+        persian_alpha_codepoints = '[\u0621-\u0628\u062a-\u063a\u0641-\u0642\u0644-\u0648\u064e-\u0651\u0655\u067e\u0686\u0698\u06a9\u06af\u06be\u06cc]'
+        alphabets = (
+            ('arabic', config.ALPHABET_WHITELIST_ARABIC, arabic_alpha_codepoints),
+            ('cyrillic', config.ALPHABET_WHITELIST_CYRILLIC, cyrillic_alpha_codepoints),
+            ('persian', config.ALPHABET_WHITELIST_PERSIAN, persian_alpha_codepoints),
+        )
+        if config.ALPHABET_FILTERS and not self.check_admin(user):  # one admin lookup for all
+            for name, whitelist, codepoints in alphabets:
+                if name in config.ALPHABET_FILTERS and self.id not in whitelist:
+                    if re.search(codepoints, msg.text):
+                        raise PostRejected("filtration rejected {} message".format(name.capitalize()))
+
         if config.PROFANITY_FILTER and not self.check_admin(user):
             import better_profanity
 
-            msg = Post(raw=data)
             for part in (msg.text, msg.username):
                 if better_profanity.profanity.contains_profanity(part):
                     if config.PROFANITY_SILENT:
                         return True
                     else:
                         # FIXME: can we send back some error code that makes Session not retry?
-                        raise PostRejected("filtration rejected message")
+                        raise PostRejected("filtration rejected profane message")
         return False
 
     def _own_files(self, msg_id: int, files: List[int], user):