Skip to content

Commit 0e5226e

Browse files
committed
feat: support extra stopwords
1 parent ac2db15 commit 0e5226e

File tree

4 files changed

+37
-6
lines changed

4 files changed

+37
-6
lines changed

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,15 @@ python setup.py install
3030

3131
## Usage and Documentation
3232

33-
For usage and further documentation please visit the [Documentation](https://openodia.soumendrak.com/) page.
33+
For usage and further documentation please visit the [Documentation](https://openodia.soumendrak.com/) page.
34+
35+
### Example
36+
37+
```python
38+
from openodia import ud
39+
ud.remove_stopwords("ରାମ ଓ ସୀତା ଆମକୁ ଆଶୀର୍ବାଦ ଦେଇଛନ୍ତି", extra_stopwords=["ଆଶୀର୍ବାଦ"])
40+
# ['ରାମ', 'ସୀତା', 'ଆମକୁ']
41+
```
3442

3543
## License
3644

docs/index.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ ud.sentence_tokenizer()
222222
- Frequently occurring words in a language are called _stopwords_. You can remove them using the function below.
223223
- Internally this method calls the `word_tokenize` method to get tokens from the text.
224224
- Because most processing is done on lists, a list of strings is returned by default.
225+
- You can also pass a list of extra stopwords to filter out words of your choice.
225226

226227
```python
227228
from openodia import ud
@@ -234,6 +235,11 @@ ud.remove_stopwords("ରାମ ଓ ସୀତା ଆମକୁ ଆଶୀର୍ବ
234235
'''
235236
'ରାମ ସୀତା ଆମକୁ ଆଶୀର୍ବାଦ'
236237
'''
238+
239+
ud.remove_stopwords("ରାମ ଓ ସୀତା ଆମକୁ ଆଶୀର୍ବାଦ ଦେଇଛନ୍ତି", extra_stopwords=["ଆଶୀର୍ବାଦ"])
240+
'''
241+
['ରାମ', 'ସୀତା', 'ଆମକୁ']
242+
'''
237243
```
238244
Here the stopwords `ଓ` and `ଦେଇଛନ୍ତି` are removed from the text.
239245

openodia/_understandData.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,23 @@ def sentence_tokenizer(cls, text):
2525
return sent_list
2626

2727
@classmethod
28-
def remove_stopwords(cls, text: Union[str, List[str]], get_str: bool = False) -> Union[List[str], str]:
29-
"""Remove frequently used words from the text
30-
:param text: It can take both tokens and text string as input
31-
:param get_str: provide whether the output needed on str or list
28+
def remove_stopwords(
29+
cls,
30+
text: Union[str, List[str]],
31+
get_str: bool = False,
32+
extra_stopwords: List[str] | None = None,
33+
) -> Union[List[str], str]:
34+
"""Remove frequently used words from the text.
35+
36+
:param text: It can take both tokens and text string as input.
37+
:param get_str: Set ``True`` to get string output instead of list.
38+
:param extra_stopwords: Additional stopwords provided by user.
3239
"""
3340
token_list: List[str] = cls.word_tokenizer(text) if isinstance(text, str) else text
34-
cleaned_tokens = [token for token in token_list if token not in STOPWORDS]
41+
stopwords = set(STOPWORDS)
42+
if extra_stopwords:
43+
stopwords.update(extra_stopwords)
44+
cleaned_tokens = [token for token in token_list if token not in stopwords]
3545
return " ".join(cleaned_tokens) if get_str else cleaned_tokens
3646

3747
@classmethod

tests/test_understandData.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,13 @@ def test_remove_stopwords_with_list_input(self):
118118
expected = ["ରାମ", "ସୀତା", "ଆମକୁ"]
119119
assert result == expected
120120

121+
def test_remove_stopwords_with_extra_stopwords(self):
122+
"""Test remove_stopwords with user supplied stopwords"""
123+
text = "ରାମ ଓ ସୀତା ଆମକୁ ଆଶୀର୍ବାଦ ଦେଇଛନ୍ତି"
124+
result = ud.remove_stopwords(text, extra_stopwords=["ଆମକୁ"])
125+
assert "ଆମକୁ" not in result
126+
assert result == ["ରାମ", "ସୀତା", "ଆଶୀର୍ବାଦ"]
127+
121128
def test_remove_stopwords_empty_string(self):
122129
"""Test remove_stopwords with empty string"""
123130
assert ud.remove_stopwords("") == []

0 commit comments

Comments
 (0)