Skip to content

Commit dc683c4

Browse files
committed
Merge pull request #7 from Alir3z4/kissarat-master
LANGUAGE_MAPPING Thanks @kissarat @dmiro
2 parents ab10703 + 8b5c93c commit dc683c4

File tree

8 files changed

+46
-33
lines changed

8 files changed

+46
-33
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,6 @@ dist/
1313
logs/
1414
src/
1515
.c9/
16+
bin/
17+
develop-eggs/
18+
eggs/

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ python:
88
install:
99
- git submodule init
1010
- git submodule update
11+
- git submodule foreach git pull origin master
1112
- pip install -U setuptools coveralls
1213
- python bootstrap.py
1314
- ./bin/buildout

AUTHORS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ answer newbie questions, and generally made ``python-stop-words`` that much bett
1313
* Alireza Savand <alireza.savand@gmail.com>
1414
* Julien Fache <fantomas42@gmail.com>
1515
* David Miró <lite.3engine@gmail.com>
16+
* Taras Labiak <kissarat@gmail.com>
1617

1718

1819
A big THANK YOU goes to:

ChangeLog.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
2015.2.21
2+
=========
3+
----
4+
5+
* Feature: ``LANGUAGE_MAPPING`` is loaded from stop-words/languages.json
6+
* Fix: Made paths OS-independent
7+
8+
19
2015.1.31
210
=========
311
----

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ Available languages
6767
* Spanish
6868
* Swedish
6969
* Turkish
70-
70+
* Ukrainian
7171

7272
Installation
7373
------------

stop_words/__init__.py

Lines changed: 16 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,15 @@
1+
import json
12
import os
23

3-
__VERSION__ = (2015, 1, 31)
4+
__VERSION__ = (2015, 2, 21)
45
CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
5-
STOP_WORDS_DIR = os.path.join(CURRENT_DIR, 'stop-words/')
6+
STOP_WORDS_DIR = os.path.join(CURRENT_DIR, 'stop-words')
67
STOP_WORDS_CACHE = {}
78

8-
LANGUAGE_MAPPING = {
9-
'ar': 'arabic',
10-
'ca': 'catalan',
11-
'da': 'danish',
12-
'nl': 'dutch',
13-
'en': 'english',
14-
'fi': 'finnish',
15-
'fr': 'french',
16-
'de': 'german',
17-
'hu': 'hungarian',
18-
'it': 'italian',
19-
'nb': 'norwegian',
20-
'pt': 'portuguese',
21-
'ro': 'romanian',
22-
'ru': 'russian',
23-
'es': 'spanish',
24-
'sv': 'swedish',
25-
'tr': 'turkish',
26-
}
9+
with open(os.path.join(STOP_WORDS_DIR, 'languages.json'), 'rb') as map_file:
10+
buffer = map_file.read()
11+
buffer = buffer.decode('ascii')
12+
LANGUAGE_MAPPING = json.loads(buffer)
2713

2814
AVAILABLE_LANGUAGES = LANGUAGE_MAPPING.values()
2915

@@ -48,23 +34,25 @@ def get_stop_words(language):
4834
try:
4935
language = LANGUAGE_MAPPING[language]
5036
except KeyError:
51-
pass
52-
53-
if language not in AVAILABLE_LANGUAGES:
54-
raise StopWordError('"%s" language is unavailable.' % language)
37+
if language not in AVAILABLE_LANGUAGES:
38+
raise StopWordError('{0}" language is unavailable.'.format(
39+
language
40+
))
5541

5642
if language in STOP_WORDS_CACHE:
5743
return STOP_WORDS_CACHE[language]
5844

45+
language_filename = os.path.join(STOP_WORDS_DIR, language + '.txt')
5946
try:
60-
language_filename = '{0}{1}.txt'.format(STOP_WORDS_DIR, language)
6147
with open(language_filename, 'rb') as language_file:
6248
stop_words = [line.decode('utf-8').strip()
6349
for line in language_file.readlines()]
6450
except IOError:
6551
raise StopWordError(
66-
'"%s" file is unreadable, check your installation.' %
67-
language_filename)
52+
'{0}" file is unreadable, check your installation.'.format(
53+
language_filename
54+
)
55+
)
6856

6957
STOP_WORDS_CACHE[language] = stop_words
7058

stop_words/tests.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
"""
22
Tests for stop-words
33
"""
4+
import random
45
from unittest import TestCase
56
from unittest import TestSuite
67
from unittest import TestLoader
78

89
import stop_words
9-
from stop_words import StopWordError
1010
from stop_words import get_stop_words
1111
from stop_words import safe_get_stop_words
12+
from stop_words import StopWordError
1213
from stop_words import STOP_WORDS_CACHE
14+
from stop_words import LANGUAGE_MAPPING
15+
from stop_words import AVAILABLE_LANGUAGES
1316

1417

1518
class StopWordsTestCase(TestCase):
@@ -29,7 +32,7 @@ def test_get_stop_words_cache(self):
2932
sw = get_stop_words('fr')
3033
self.assertTrue('french' in STOP_WORDS_CACHE)
3134
original_stop_words_dir = stop_words.STOP_WORDS_DIR
32-
stop_words.STOP_WORDS_DIR = '/trash/'
35+
stop_words.STOP_WORDS_DIR = 'not-existing-directory'
3336
self.assertEqual(sw, get_stop_words('french'))
3437
stop_words.STOP_WORDS_DIR = original_stop_words_dir
3538
try:
@@ -43,14 +46,23 @@ def test_get_stop_words_unavailable_language(self):
4346

4447
def test_get_stop_words_install_issue(self):
4548
original_stop_words_dir = stop_words.STOP_WORDS_DIR
46-
stop_words.STOP_WORDS_DIR = '/trash/'
49+
stop_words.STOP_WORDS_DIR = 'not-existing-directory'
4750
self.assertRaises(StopWordError, get_stop_words, 'german')
4851
stop_words.STOP_WORDS_DIR = original_stop_words_dir
4952

5053
def test_safe_get_stop_words(self):
5154
self.assertRaises(StopWordError, get_stop_words, 'huttese')
5255
self.assertEqual(safe_get_stop_words('huttese'), [])
5356

57+
def test_random_language_stop_words_load(self):
58+
languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES)
59+
sample = random.sample(languages, len(languages))
60+
for language in sample:
61+
stop_words = safe_get_stop_words(language)
62+
self.assertTrue(
63+
len(stop_words) > 0,
64+
'Cannot load stopwords for {0} language'.format(language)
65+
)
5466

5567
loader = TestLoader()
5668

0 commit comments

Comments
 (0)