Skip to content

Commit 251b1eb

Browse files
ianmacd authored and jagerman committed
Allow rejection of postings written in a particular alphabet.
The user configures in `sogs.ini` a list of alphabets to reject by default: `alphabet_filters = [ arabic, cyrillic, persian ]` If there are rooms in which these alphabets should be allowed, a list of room ids per alphabet may be specified for whitelisting. `alphabet_whitelist_arabic = [ 8 ]` `alphabet_whitelist_cyrillic = [ 5, 13 ]` `alphabet_whitelist_persian = [ 18 ]` A list of valid room ids can be obtained with the following SQL: `$ sudo sqlite3 /var/lib/session-open-group-server/sogs.db 'SELECT * FROM rooms;'` The implementation can easily be extended to more alphabets.
1 parent 6893d2a commit 251b1eb

File tree

3 files changed

+50
-5
lines changed

3 files changed

+50
-5
lines changed

sogs.ini.sample

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,20 @@
115115
;
116116
;profanity_custom =
117117

118+
; Whether we should reject messages that use a particular alphabet.
119+
;
120+
;alphabet_filters = [ arabic, cyrillic, persian ]
121+
;alphabet_filters = []
122+
123+
124+
; Even when messages written in a given alphabet are rejected, they can still be allowed in
125+
; specific rooms. A list of whitelisted room ids can be given here, as returned by
126+
; `SELECT * FROM rooms;`. An empty list means the alphabet is rejected in every room.
127+
;
128+
;alphabet_whitelist_arabic = []
129+
;alphabet_whitelist_cyrillic = []
130+
;alphabet_whitelist_persian = []
131+
118132

119133
[web]
120134

sogs/config.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
PROFANITY_FILTER = False
3333
PROFANITY_SILENT = True
3434
PROFANITY_CUSTOM = None
35+
ALPHABET_FILTERS = []
36+
ALPHABET_WHITELIST_ARABIC = []
37+
ALPHABET_WHITELIST_CYRILLIC = []
38+
ALPHABET_WHITELIST_PERSIAN = []
3539
REQUIRE_BLIND_KEYS = False
3640
TEMPLATE_PATH = 'templates'
3741
STATIC_PATH = 'static'
@@ -83,6 +87,12 @@ def days_to_seconds(v):
8387
def days_to_seconds_or_none(v):
    """Return ``days_to_seconds(v)``, or ``None`` when *v* is falsy."""
    if not v:
        return None
    return days_to_seconds(v)
8589

90+
def list_of_strs(v):
    """Parse a bracketed config value such as ``[ a, b, c ]`` into a list of strings.

    Items may be separated by commas and/or whitespace.  Surrounding square
    brackets are optional; they are removed only when both are present, so an
    unbracketed value does not lose its first and last characters.

    Args:
        v: The raw option string.

    Returns:
        The list of non-empty items, in order.  ``[]`` (or an all-whitespace
        value) yields an empty list rather than ``['']``.
    """
    inner = v.strip()
    if inner.startswith('[') and inner.endswith(']'):
        inner = inner[1:-1]
    # re.split() returns [''] for an empty string and leaves an empty trailing
    # item after a dangling comma, so drop empty pieces explicitly.
    return [item for item in re.split(r'[,\s]+', inner.strip()) if item]
92+
93+
def list_of_ints(v):
    """Parse a bracketed config value such as ``[ 5, 13 ]`` into a list of ints.

    The value is tokenised by ``list_of_strs``.  Empty tokens are skipped so
    that an empty list (``[]``, the documented default for the whitelist
    options) produces ``[]`` instead of failing on ``int('')``.

    Raises:
        ValueError: if a non-empty item is not a valid integer literal.
    """
    return [int(item) for item in list_of_strs(v) if item]
95+
8696
truthy = ('y', 'yes', 'Y', 'Yes', 'true', 'True', 'on', 'On', '1')
8797
falsey = ('n', 'no', 'N', 'No', 'false', 'False', 'off', 'Off', '0')
8898
booly = truthy + falsey
@@ -126,6 +136,10 @@ def bool_opt(name):
126136
'profanity_filter': bool_opt('PROFANITY_FILTER'),
127137
'profanity_silent': bool_opt('PROFANITY_SILENT'),
128138
'profanity_custom': ('PROFANITY_CUSTOM', path_exists, val_or_none),
139+
'alphabet_filters': ('ALPHABET_FILTERS', None, list_of_strs),
140+
'alphabet_whitelist_arabic': ('ALPHABET_WHITELIST_ARABIC', None, list_of_ints),
141+
'alphabet_whitelist_cyrillic': ('ALPHABET_WHITELIST_CYRILLIC', None, list_of_ints),
142+
'alphabet_whitelist_persian': ('ALPHABET_WHITELIST_PERSIAN', None, list_of_ints),
129143
},
130144
'web': {
131145
'template_path': ('TEMPLATE_PATH', path_exists, val_or_none),

sogs/model/room.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -699,28 +699,45 @@ def get_messages_for(
699699

700700
def should_filter(self, user: User, data: bytes):
    """
    Checks a message for disallowed alphabets and profanity (if the profanity
    filter is enabled).

    - Returns False if this message passes (i.e. didn't trigger any filter, or is
      being posted by an admin to whom the filters don't apply).

    Otherwise, depending on the filtering configuration:
    - Returns True if this message should be silently accepted but filtered (i.e. not shown to
      users).
    - Throws PostRejected if the message should be rejected (and rejection passed back to the
      user).

    Alphabet filters are checked in the order arabic, cyrillic, persian, and
    before the profanity filter.  An alphabet listed in
    ``config.ALPHABET_FILTERS`` is rejected unless this room's id is in that
    alphabet's whitelist.
    """
    # Constructed unconditionally, exactly as before, so that whatever
    # Post(raw=...) does with the payload still happens for every call.
    msg = Post(raw=data)

    # (filter name, whitelisted room ids, character class, rejection message).
    # To support another alphabet, add a row here plus its config options.
    alphabet_rules = (
        (
            'arabic',
            config.ALPHABET_WHITELIST_ARABIC,
            '[\u0600-\u06ff\u0750-\u077f\u08a0-\u08ff\ufb50-\ufdff\ufe70-\ufeff]',
            "filtration rejected Arabic message",
        ),
        (
            'cyrillic',
            config.ALPHABET_WHITELIST_CYRILLIC,
            '[\u0400-\u04ff]',
            "filtration rejected Cyrillic message",
        ),
        (
            'persian',
            config.ALPHABET_WHITELIST_PERSIAN,
            '[\u0621-\u0628\u062a-\u063a\u0641-\u0642\u0644-\u0648\u064e-\u0651\u0655\u067e\u0686\u0698\u06a9\u06af\u06be\u06cc]',
            "filtration rejected Persian message",
        ),
    )
    active_rules = [rule for rule in alphabet_rules if rule[0] in config.ALPHABET_FILTERS]

    # Nothing is enabled: pass without consulting check_admin, as the
    # original code did in this case.
    if not active_rules and not config.PROFANITY_FILTER:
        return False

    # Every filter exempts admins, so look this up once rather than once per
    # filter.
    if self.check_admin(user):
        return False

    for _name, whitelist, pattern, rejection in active_rules:
        if self.id not in whitelist and re.search(pattern, msg.text):
            raise PostRejected(rejection)

    if config.PROFANITY_FILTER:
        import better_profanity

        for part in (msg.text, msg.username):
            if better_profanity.profanity.contains_profanity(part):
                if config.PROFANITY_SILENT:
                    return True
                # FIXME: can we send back some error code that makes Session not retry?
                raise PostRejected("filtration rejected profane message")

    return False
725742

726743
def _own_files(self, msg_id: int, files: List[int], user):

0 commit comments

Comments
 (0)