|
| 1 | +import re |
1 | 2 | import warnings |
2 | 3 |
|
3 | 4 | import enchant |
|
8 | 9 |
|
# Shared English Snowball stemmer used by the stem_word utility.
STEMMER = SnowballStemmer("english")
# English stopword list held as a set for membership tests in is_stopword.
STOPWORDS = set(stopwords.words('english'))
11 | | -BADWORDS = set([ |
12 | | - "anus", "ass", |
13 | | - "bitch", "bootlip", "butt", |
| 12 | +BAD_REGEXES = [ |
| 13 | + "a+nus+", "ass+", |
| 14 | + "bitch", "bootlip", "butt+", |
14 | 15 | "chlamydia", "cholo", "chug", "cocksuck", "coonass", "cracker", "cunt", |
15 | | - "dick", "dickhead", "dothead", |
16 | | - "fag", "faggot", |
17 | | - "fart", "fat", "fuck", "fucker", |
| 16 | + "dick", "dothead", |
| 17 | + "(f|ph)ag+(ot)?", "fart", "fat", "fuck", |
18 | 18 | "gipp", "gippo", "gonorrhea", "gook", "gringo", "gypo", "gyppie", "gyppo", |
19 | 19 | "gyppy", |
20 | 20 | "herpes", "hillbilly", "hiv", "homosexual", "hori", |
|
23 | 23 | "kike", "kwashi", "kyke", |
24 | 24 | "lesbian", "lick", |
25 | 25 | "motherfuck", |
26 | | - "nig", "nigar", "nigette", "nigga", "niggah", "niggar", "nigger", |
27 | | - "niggress", "nigguh", "niggur", "niglet", "nigor", "nigr", "nigra", |
28 | | - "peckerwood", "penis", "piss", |
| 26 | + "nig", "nig+(a|e|u)+(r|h)+", "niggress" |
| 27 | + "niglet", "nigor", "nigr", "nigra", |
| 28 | + "pecker(wood)?", "peni(s)?", "piss", |
29 | 29 | "quashi", |
30 | 30 | "raghead", "redneck", "redskin", "roundeye", |
31 | | - "scabies", "shit", "shitty", "slut", "slutty", "spic", "spick", "spig", |
32 | | - "spigotty", "spik", "spook", "squarehead", "stupid", "suck", "syphilis", |
| 31 | + "scabies", "shi+t+", "slut", "spi(g|c|k)+", |
| 32 | + "spigotty", "spik", "spook", "squarehead", "st(u|oo+)pid", "suck", |
| 33 | + "syphil+is", |
33 | 34 | "turd", "twat", |
34 | 35 | "wank", "wetback", "whore", "wog", "wop", |
35 | 36 | "yank", "yankee", "yid", |
36 | 37 | "zipperhead" |
37 | | -]) |
38 | | -INFORMAL_WORDS = set([ |
39 | | - 'awesome', 'awesomest', 'awsome' |
40 | | - 'bla', 'blah', 'boner', 'boobs', 'bullshit' |
41 | | - 'cant', 'coolest', 'crap' |
42 | | - "dont", "dumb", "dumbass", |
| 38 | +] |
# Informal / colloquial vocabulary, written as regular-expression fragments.
# The fragments are joined with "|" to build INFORMAL_REGEX, so every entry
# must be its own list element: a missing comma silently concatenates two
# adjacent string literals into a single pattern that matches neither word.
INFORMAL_REGEXES = [
    'awesome', 'awesomest', 'awsome',
    'bla', 'blah', 'boner', 'boobs', 'bullshit',
    'cant', 'coolest', 'crap',
    "don'?t", "dumb", "dumbass",
    "haha", "hello", "hey",
    "kool",
    "lol", "luv",
    "meow",
    'shove', 'smelly', 'sooo', 'stinky', 'sucking', 'sux', "shouldn't",
    "tits",
    "wasn'?t", "wuz", "won'?t",
    'yall', 'yay', 'yea', 'yolo',
]
# Collapse each fragment list into one alternation so that a word can be
# tested against the whole vocabulary with a single compiled pattern.
BAD_REGEX = re.compile("|".join(BAD_REGEXES))
INFORMAL_REGEX = re.compile("|".join(INFORMAL_REGEXES))
# Enchant dictionary for the "en" language tag, used by is_misspelled.
DICTIONARY = enchant.Dict("en")
53 | 56 |
|
54 | 57 |
|
def stem_word_process():
    """Build the callable that is registered as the "stem_word" utility."""
    def stem_word(word):
        # Stem with the shared STEMMER first, then lowercase the result.
        stemmed = STEMMER.stem(word)
        return stemmed.lower()
    return stem_word


stem_word = LanguageUtility("stem_word", stem_word_process)
60 | 63 |
|
61 | 64 |
|
def is_badword_process():
    """Build the predicate that is registered as the "is_badword" utility."""
    def is_badword(word):
        # The word is lowercased before matching.  match() anchors only at
        # the start of the string, so a word that merely begins with one of
        # the BAD_REGEX alternatives is reported as well.
        lowered = word.lower()
        return BAD_REGEX.match(lowered) is not None
    return is_badword


is_badword = LanguageUtility("is_badword", is_badword_process)
67 | 70 |
|
68 | 71 |
|
def is_informal_word_process():
    """Build the predicate registered as the "is_informal_word" utility."""
    def is_informal_word(word):
        # The word is lowercased before matching.  match() anchors only at
        # the start of the string, so a word that merely begins with one of
        # the INFORMAL_REGEX alternatives is reported as well.
        lowered = word.lower()
        return INFORMAL_REGEX.match(lowered) is not None
    return is_informal_word


is_informal_word = LanguageUtility(
    "is_informal_word",
    is_informal_word_process,
)
75 | 78 |
|
76 | 79 |
|
def is_misspelled_process():
    """Build the predicate registered as the "is_misspelled" utility."""
    def is_misspelled(word):
        # A word counts as misspelled when DICTIONARY.check rejects it.
        known = DICTIONARY.check(word)
        return not known
    return is_misspelled


is_misspelled = LanguageUtility("is_misspelled", is_misspelled_process)
84 | 86 |
|
def is_stopword_process():
    """Build the predicate registered as the "is_stopword" utility."""
    def is_stopword(word):
        # Membership is tested on the lowercased form of the word.
        lowered = word.lower()
        return lowered in STOPWORDS
    return is_stopword


is_stopword = LanguageUtility("is_stopword", is_stopword_process)
90 | 92 |
|
# Language object bundling the English utilities defined in this module.
english = Language(
    "revscoring.languages.english",
    [stem_word, is_badword, is_misspelled, is_stopword, is_informal_word],
)
|
0 commit comments