feat: support extra stopwords

soumendrak · soumendrak · commit 0e5226e7d3ed · 2025-06-28T20:35:47.000+05:30
diff --git a/README.md b/README.md
@@ -30,7 +30,15 @@ python setup.py install
 
 ## Usage and Documentation
 
-For usage and further documentation please visit the [Documentation](https://openodia.soumendrak.com/) page. 
+For usage and further documentation please visit the [Documentation](https://openodia.soumendrak.com/) page.
+
+### Example
+
+```python
+from openodia import ud
+ud.remove_stopwords("ରାମ ଓ ସୀତା ଆମକୁ ଆଶୀର୍ବାଦ ଦେଇଛନ୍ତି", extra_stopwords=["ଆଶୀର୍ବାଦ"])
+# ['ରାମ', 'ସୀତା', 'ଆମକୁ']
+```
 
 ## License
 
diff --git a/docs/index.md b/docs/index.md
@@ -222,6 +222,7 @@ ud.sentence_tokenizer()
 - Frequently occurring words in a language are called as _stopwords_. Using the below function you can remove the stopwords.
 - Internally this method calls the `word_tokenize` method to get tokens from the text.
 - As most of the time processing happens in list by default a list of strings will be returned.
+- You can also pass your own list of words via `extra_stopwords`; they are removed in addition to the built-in stopwords.
 
 ```python
 from openodia import ud
@@ -234,6 +235,11 @@ ud.remove_stopwords("ରାମ ଓ ସୀତା ଆମକୁ ଆଶୀର୍ବ
 '''
 'ରାମ ସୀତା ଆମକୁ ଆଶୀର୍ବାଦ'
 '''
+
+ud.remove_stopwords("ରାମ ଓ ସୀତା ଆମକୁ ଆଶୀର୍ବାଦ ଦେଇଛନ୍ତି", extra_stopwords=["ଆଶୀର୍ବାଦ"])
+'''
+['ରାମ', 'ସୀତା', 'ଆମକୁ']
+'''
 ```
 Here the stopwords `ଓ` and `ଦେଇଛନ୍ତି` are removed from the text.
 
diff --git a/openodia/_understandData.py b/openodia/_understandData.py
@@ -25,13 +25,23 @@ def sentence_tokenizer(cls, text):
         return sent_list
 
     @classmethod
-    def remove_stopwords(cls, text: Union[str, List[str]], get_str: bool = False) -> Union[List[str], str]:
-        """Remove frequently used words from the text
-        :param text: It can take both tokens and text string as input
-        :param get_str: provide whether the output needed on str or list
+    def remove_stopwords(
+        cls,
+        text: Union[str, List[str]],
+        get_str: bool = False,
+        extra_stopwords: Union[List[str], None] = None,
+    ) -> Union[List[str], str]:
+        """Remove stopwords (built-in plus any caller-supplied ones) from the text.
+
+        :param text: A text string (tokenized via ``word_tokenizer``) or a list of tokens.
+        :param get_str: Set ``True`` to get a space-joined string instead of a list.
+        :param extra_stopwords: Extra words to drop in addition to ``STOPWORDS``.
         """
         token_list: List[str] = cls.word_tokenizer(text) if isinstance(text, str) else text
-        cleaned_tokens = [token for token in token_list if token not in STOPWORDS]
+        stopwords = set(STOPWORDS)  # per-call copy: never mutate the shared STOPWORDS
+        if extra_stopwords:
+            stopwords.update(extra_stopwords)
+        cleaned_tokens = [token for token in token_list if token not in stopwords]
         return " ".join(cleaned_tokens) if get_str else cleaned_tokens
 
     @classmethod
diff --git a/tests/test_understandData.py b/tests/test_understandData.py
@@ -118,6 +118,13 @@ def test_remove_stopwords_with_list_input(self):
         expected = ["ରାମ", "ସୀତା", "ଆମକୁ"]
         assert result == expected
 
+    def test_remove_stopwords_with_extra_stopwords(self):
+        """Test remove_stopwords with user-supplied stopwords"""
+        sentence = "ରାମ ଓ ସୀତା ଆମକୁ ଆଶୀର୍ବାଦ ଦେଇଛନ୍ତି"
+        remaining = ud.remove_stopwords(sentence, extra_stopwords=["ଆମକୁ"])
+        assert remaining == ["ରାମ", "ସୀତା", "ଆଶୀର୍ବାଦ"]
+        assert "ଆମକୁ" not in remaining
+
     def test_remove_stopwords_empty_string(self):
         """Test remove_stopwords with empty string"""
         assert ud.remove_stopwords("") == []