Skip to content
This repository was archived by the owner on Aug 9, 2024. It is now read-only.

Commit 110c1c0

Browse files
Merge pull request #129 from wiki-ai/english_regex
Converts English language utilities to Regex style and fixes minor issues
2 parents 7bbe7c2 + 702b865 commit 110c1c0

File tree

3 files changed

+37
-102
lines changed

3 files changed

+37
-102
lines changed

revscoring/languages/english.py

Lines changed: 33 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
import warnings
23

34
import enchant
@@ -8,13 +9,12 @@
89

910
STEMMER = SnowballStemmer("english")
1011
STOPWORDS = set(stopwords.words('english'))
11-
BADWORDS = set([
12-
"anus", "ass",
13-
"bitch", "bootlip", "butt",
12+
BAD_REGEXES = [
13+
"a+nus+", "ass+",
14+
"bitch", "bootlip", "butt+",
1415
"chlamydia", "cholo", "chug", "cocksuck", "coonass", "cracker", "cunt",
15-
"dick", "dickhead", "dothead",
16-
"fag", "faggot",
17-
"fart", "fat", "fuck", "fucker",
16+
"dick", "dothead",
17+
"(f|ph)ag+(ot)?", "fart", "fat", "fuck",
1818
"gipp", "gippo", "gonorrhea", "gook", "gringo", "gypo", "gyppie", "gyppo",
1919
"gyppy",
2020
"herpes", "hillbilly", "hiv", "homosexual", "hori",
@@ -23,70 +23,72 @@
2323
"kike", "kwashi", "kyke",
2424
"lesbian", "lick",
2525
"motherfuck",
26-
"nig", "nigar", "nigette", "nigga", "niggah", "niggar", "nigger",
27-
"niggress", "nigguh", "niggur", "niglet", "nigor", "nigr", "nigra",
28-
"peckerwood", "penis", "piss",
26+
"nig", "nig+(a|e|u)+(r|h)+", "niggress",
27+
"niglet", "nigor", "nigr", "nigra",
28+
"pecker(wood)?", "peni(s)?", "piss",
2929
"quashi",
3030
"raghead", "redneck", "redskin", "roundeye",
31-
"scabies", "shit", "shitty", "slut", "slutty", "spic", "spick", "spig",
32-
"spigotty", "spik", "spook", "squarehead", "stupid", "suck", "syphilis",
31+
"scabies", "shi+t+", "slut", "spi(g|c|k)+",
32+
"spigotty", "spik", "spook", "squarehead", "st(u|oo+)pid", "suck",
33+
"syphil+is",
3334
"turd", "twat",
3435
"wank", "wetback", "whore", "wog", "wop",
3536
"yank", "yankee", "yid",
3637
"zipperhead"
37-
])
38-
INFORMAL_WORDS = set([
39-
'awesome', 'awesomest', 'awsome'
40-
'bla', 'blah', 'boner', 'boobs', 'bullshit'
41-
'cant', 'coolest', 'crap'
42-
"dont", "dumb", "dumbass",
38+
]
39+
INFORMAL_REGEXES = [
40+
'awesome', 'awesomest', 'awsome',
41+
'bla', 'blah', 'boner', 'boobs', 'bullshit',
42+
'cant', 'coolest', 'crap',
43+
"don'?t", "dumb", "dumbass",
4344
"haha", "hello", "hey",
4445
"kool",
4546
"lol", "luv",
4647
"meow",
47-
'shove', 'smelly', 'sooo', 'stinky', 'sucking', 'sux'
48+
'shove', 'smelly', 'sooo', 'stinky', 'sucking', 'sux', "shouldn\'t",
4849
"tits",
49-
"wuz",
50-
'yall', 'yay', 'yea', 'yolo'])
51-
STEMMED_BADWORDS = set(STEMMER.stem(w) for w in BADWORDS)
50+
"wasn'?t", "wuz", "won'?t",
51+
'yall', 'yay', 'yea', 'yolo'
52+
]
53+
BAD_REGEX = re.compile("|".join(BAD_REGEXES))
54+
INFORMAL_REGEX = re.compile("|".join(INFORMAL_REGEXES))
5255
DICTIONARY = enchant.Dict("en")
5356

5457

5558
def stem_word_process():
5659
def stem_word(word):
5760
return STEMMER.stem(word).lower()
5861
return stem_word
59-
stem_word = LanguageUtility("stem_word", stem_word_process, depends_on=[])
62+
stem_word = LanguageUtility("stem_word", stem_word_process)
6063

6164

62-
def is_badword_process(stem_word):
65+
def is_badword_process():
6366
def is_badword(word):
64-
return stem_word(word) in STEMMED_BADWORDS
67+
return bool(BAD_REGEX.match(word.lower()))
6568
return is_badword
66-
is_badword = LanguageUtility("is_badword", is_badword_process, depends_on=[stem_word])
69+
is_badword = LanguageUtility("is_badword", is_badword_process)
6770

6871

69-
def is_informal_word_process(stem_word):
72+
def is_informal_word_process():
7073
def is_informal_word(word):
71-
return stem_word(word) in INFORMAL_WORDS
74+
return bool(INFORMAL_REGEX.match(word.lower()))
7275
return is_informal_word
7376
is_informal_word = LanguageUtility("is_informal_word",
74-
is_informal_word_process, depends_on=[stem_word])
77+
is_informal_word_process)
7578

7679

7780
def is_misspelled_process():
7881
def is_misspelled(word):
7982
return not DICTIONARY.check(word)
8083
return is_misspelled
8184

82-
is_misspelled = LanguageUtility("is_misspelled", is_misspelled_process,
83-
depends_on=[])
85+
is_misspelled = LanguageUtility("is_misspelled", is_misspelled_process)
8486

8587
def is_stopword_process():
8688
def is_stopword(word):
8789
return word.lower() in STOPWORDS
8890
return is_stopword
89-
is_stopword = LanguageUtility("is_stopword", is_stopword_process, depends_on=[])
91+
is_stopword = LanguageUtility("is_stopword", is_stopword_process)
9092

9193
english = Language("revscoring.languages.english",
9294
[stem_word, is_badword, is_misspelled, is_stopword, is_informal_word])

revscoring/languages/english.regex.py

Lines changed: 0 additions & 67 deletions
This file was deleted.

revscoring/languages/tests/test_english.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ def test_language():
1010
eq_(stem_word()("Shitting"), "shit")
1111
eq_(hash(stem_word), hash(language.stem_word))
1212

13-
assert is_badword(stem_word())("shit")
14-
assert is_badword(stem_word())("shitty")
15-
assert is_badword(stem_word())("Shitty")
16-
assert not is_badword(stem_word())("hat")
13+
assert is_badword()("shit")
14+
assert is_badword()("shitty")
15+
assert is_badword()("Shitty")
16+
assert not is_badword()("hat")
1717
eq_(hash(is_badword), hash(language.is_badword))
1818

1919
assert is_misspelled()("wjwkjb")

0 commit comments

Comments
 (0)