Skip to content
This repository was archived by the owner on Aug 9, 2024. It is now read-only.

Commit 627bc70

Browse files
committed
Merge pull request #206 from wiki-ai/more_languages
Adds Italian, German and Dutch with tests & docs
2 parents 14ba3c5 + e2f94de commit 627bc70

21 files changed

+1091
-340
lines changed

.travis.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ addons:
1111
- libopenblas-dev
1212
- python3-dev
1313
- enchant
14+
- aspell-de
15+
- aspell-nl
16+
- aspell-id
1417
- myspell-en-au
1518
- myspell-en-gb
1619
- myspell-en-us
@@ -19,7 +22,7 @@ addons:
1922
- myspell-fr
2023
- myspell-es
2124
- myspell-he
22-
- aspell-id
25+
- myspell-it
2326
- hunspell-vi
2427
- myspell-pt
2528
before_install:

README.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,12 @@ some NLTK data. The following command will get the necessary corpus.
5151
You'll also need to install `enchant <https://enchant.org>`_ compatible
5252
dictionaries of the languages you'd like to use. We recommend the following:
5353

54+
* ``languages.dutch``: myspell-nl
5455
* ``languages.english``: myspell-en-us myspell-en-gb myspell-en-au
5556
* ``languages.french``: myspell-fr
57+
* ``languages.german``: myspell-de-at myspell-de-ch myspell-de-de
5658
* ``languages.indonesian``: aspell-id
59+
* ``languages.italian``: myspell-it
5760
* ``languages.hebrew``: myspell-he
5861
* ``languages.portuguese``: myspell-pt
5962
* ``languages.persian``: myspell-fa

revscoring/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,19 @@
7373
collection of language feature sets that work like other features except
7474
that they are language-specific. Language-specific feature sets are
7575
available for the following languages:
76+
:data:`~revscoring.languages.dutch`,
7677
:data:`~revscoring.languages.english`,
7778
:data:`~revscoring.languages.french`,
79+
:data:`~revscoring.languages.german`,
7880
:data:`~revscoring.languages.hebrew`,
7981
:data:`~revscoring.languages.indonesian`,
82+
:data:`~revscoring.languages.italian`,
8083
:data:`~revscoring.languages.persian`,
8184
:data:`~revscoring.languages.portuguese`,
8285
:data:`~revscoring.languages.spanish`,
8386
:data:`~revscoring.languages.turkish`, and
84-
:data:`~revscoring.languages.vietnamese`. See :mod:`revscoring.languages`
87+
:data:`~revscoring.languages.vietnamese`.
88+
See :mod:`revscoring.languages`
8589
8690
Example:
8791

revscoring/languages/__init__.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,26 +4,29 @@
44
55
languages
66
+++++++++
7+
.. automodule:: revscoring.languages.dutch
8+
79
.. automodule:: revscoring.languages.english
810
911
.. automodule:: revscoring.languages.french
1012
13+
.. automodule:: revscoring.languages.german
14+
1115
.. automodule:: revscoring.languages.hebrew
1216
1317
.. automodule:: revscoring.languages.indonesian
1418
19+
.. automodule:: revscoring.languages.italian
20+
1521
.. automodule:: revscoring.languages.persian
1622
1723
.. automodule:: revscoring.languages.portuguese
1824
1925
.. automodule:: revscoring.languages.spanish
20-
:members:
2126
2227
.. automodule:: revscoring.languages.turkish
23-
:members:
2428
2529
.. automodule:: revscoring.languages.vietnamese
26-
:members:
2730
2831
Base classes
2932
++++++++++++

revscoring/languages/dutch.py

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
import sys
2+
3+
from .space_delimited import SpaceDelimited
4+
5+
# Load the three external language resources this module hands to
# SpaceDelimited.  Every failure is surfaced as ImportError so that a missing
# resource looks like a failed import of this language module.

# A ValueError raised while constructing the Snowball stemmer is converted
# into ImportError.
try:
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("dutch")
except ValueError:
    raise ImportError("Could not load stemmer for {0}. ".format(__name__))

# A LookupError raised while reading the stopword list is converted into
# ImportError with a hint about installing the nltk corpus.
try:
    from nltk.corpus import stopwords as nltk_stopwords
    stopwords = set(nltk_stopwords.words('dutch'))
except LookupError:
    raise ImportError("Could not load stopwords for {0}. ".format(__name__) +
                      "You may need to install the nltk 'stopwords' " +
                      "corpora. See http://www.nltk.org/data.html")

# NOTE: ``enchant`` is imported outside the ``try`` on purpose.  The handler
# below names ``enchant.errors.DictNotFoundError``; if the import itself
# failed inside the ``try``, evaluating that expression would raise NameError
# (``enchant`` unbound) and hide the real ImportError from the caller.
import enchant

try:
    dictionary = enchant.Dict("nl")
except enchant.errors.DictNotFoundError:
    raise ImportError("No enchant-compatible dictionary found for 'nl'. " +
                      "Consider installing 'myspell-nl'.")
25+
26+
27+
# Regular-expression fragments for Dutch bad words, passed to SpaceDelimited
# as ``badwords``.  One pattern per line, each with a trailing comma, so that
# two adjacent literals can never be silently concatenated.
badwords = [
    r"aars",
    r"an(aal|us)\w*",
    r"balhaar",
    r"drol(len)?",
    r"fack(en|ing|s)?",
    r"facking",
    r"flikkers?",
    r"focking",
    r"ge(ile?|lul)",
    r"geneukt",
    r"hoer(en?)?",
    r"homos?",
    r"kaka?",
    r"kak(hoofd|ken)",
    r"k[ae]nker",
    r"klootzak(ken)?",
    r"klote",
    r"kont(gat|je)?",
    r"pedo",
    r"penis(sen)?",
    r"peop",
    r"piemels?",
    r"pijpen",
    r"pik",
    r"pimel",
    r"pipi",
    r"poep(chinees?|en|hoofd)?",
    r"poep(ie|je|sex|te?)s?",
    r"porno?",
    r"neuke?",
    r"neuken(de)?",
    r"neukt(en?)?",
    r"stron(d|t)",
    r"suck(s|t)?",
    r"zuigt",
    r"sukkels?",
    r"ter(ing|ten)",
    r"tetten",
    r"tieten",
    r"vagina",
    r"verekte",
    r"verkracht",
    r"dikzak",
    r"dildo",
    r"mon?g(olen|ool)?",
    r"mooiboy",
    r"negers?",
    r"shit",
    r"sperma",
    r"kut(jes?)?",
    r"stelletje",
    r"losers?",
    r"lul(len)?",
    r"reet",
    r"scheet",
    r"scheten",
    r"schijt",
    r"diaree",
    r"slet",
    r"lekkerding",
    r"likken",
]
85+
86+
# Regular-expression fragments for informal Dutch words, passed to
# SpaceDelimited as ``informals``.
#
# Every entry, including the last, ends with a comma.  Python joins adjacent
# string literals, so a missing comma silently merges two patterns into one
# (previously ``r"doei" r"dombo"`` became the single pattern "doeidombo" and
# neither word was matched).
informals = [
    r"aap(jes)?",
    r"banaan",
    r"bent",
    r"boe(it)?",
    r"doei",
    r"dombo",
    r"domme",
    r"eigelijk",
    r"godverdomme",
    r"groetjes",
    r"gwn",
    r"hoi",
    r"hal+o+",
    r"heb",
    r"hee+[jyl]", r"heee+l",
    r"houd?",
    r"(hoi+)+",
    r"hoor",
    r"izan",
    r"jij",
    r"jou",
    r"jullie",
    r"kaas",
    r"klopt",
    r"kots",
    r"kusjes",
    r"le?kke?re?",
    r"maarja",
    r"mama",
    r"nou",
    r"oma",
    r"ofzo",
    r"oke",
    r"sexy?",
    r"snap",
    r"stink(en|t)",
    r"stoer",
    r"swag",
    r"swek",
    r"vies", r"vieze",
    r"vind",
    r"vuile",
    r"xxx",
    r"yeah",
    r"zielig",
    r"zooi",
    r"yolo",
    r"zeg",
]
136+
137+
# Swap this module's entry in ``sys.modules`` for a SpaceDelimited instance
# built from the word lists and language resources defined above, so that
# importing this module name yields that instance.  ``doc`` holds the
# reStructuredText listing of the attributes exposed for documentation.
sys.modules[__name__] = SpaceDelimited(
    __name__,
    stemmer=stemmer,
    stopwords=stopwords,
    dictionary=dictionary,
    badwords=badwords,
    informals=informals,
    doc="""
dutch
=======

revision
--------
.. autoattribute:: revision.words
.. autoattribute:: revision.content_words
.. autoattribute:: revision.badwords
.. autoattribute:: revision.misspellings
.. autoattribute:: revision.informals
.. autoattribute:: revision.infonoise

parent_revision
---------------
.. autoattribute:: parent_revision.words
.. autoattribute:: parent_revision.content_words
.. autoattribute:: parent_revision.badwords
.. autoattribute:: parent_revision.misspellings
.. autoattribute:: parent_revision.informals
.. autoattribute:: parent_revision.infonoise

diff
----
.. autoattribute:: diff.words_added
.. autoattribute:: diff.words_removed
.. autoattribute:: diff.badwords_added
.. autoattribute:: diff.badwords_removed
.. autoattribute:: diff.misspellings_added
.. autoattribute:: diff.misspellings_removed
.. autoattribute:: diff.informals_added
.. autoattribute:: diff.informals_removed
""",
)

revscoring/languages/english.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,25 +123,33 @@
123123
r"don'?t", r"dum+b*(y|ies|er|est)?(ass)?",
124124
r"d+u+d+e+\w*",
125125
r"good[-_]?bye",
126-
r"h+[aiou]+(h+[aeiou]*)*", r"h+[e]+(h+[aeiou]*)+",
126+
r"(mw?[au]+)?h+[aiou]+(h+[aeiou]*)*", r"h+[e]+(h+[aeiou]*)+",
127127
r"hel+o+", r"h(aa+|e+)y+",
128128
r"h+m+",
129129
r"i", r"i+d+i+o+t+",
130+
r"(la)+",
131+
r"loser",
130132
r"(l+[uo]+l+)([uo]+l+)*",
133+
r"l+m+a+o+",
131134
r"l[ou]+ve?",
132135
r"m+e+o+w+",
133136
r"munch\w*",
134137
r"mom+(y|a)?",
135138
r"moron",
139+
r"nerds?",
140+
r"noo+b(y|ie|s)?\w*",
136141
r"no+pe",
137142
r"o+k+(a+y+)?",
138143
r"\w*o+m+g+\w*",
139144
r"poo+p\w*",
140145
r"\w*retard\w*", r"tard",
146+
r"r+o+f+l+(mao)?",
147+
r"s+e+x+y+",
148+
r"so+rry",
141149
r"shove",
142150
r"smelly",
143151
r"soo+",
144-
r"stinky",
152+
r"stink(s|y)?",
145153
r"\w*s+t+[uo]+p+i+d+\w*",
146154
r"suck(s|ing|er)?", r"sux",
147155
r"shouldn'?t",

0 commit comments

Comments
 (0)