Merge pull request #387 from stephenhky/develop

stephenhky · web-flow · commit 1b84ed69e70e · 2025-05-08T20:01:06.000-04:00
Release 2.1.1
diff --git a/README.md b/README.md
@@ -83,6 +83,7 @@ If you would like to contribute, feel free to submit the pull requests. You can
 
 ## News
 
+* 05/08/2025: `shorttext` 2.1.1 released.
 * 12/14/2024: `shorttext` 2.1.0 released.
 * 07/12/2024: `shorttext` 2.0.0 released.
 * 12/21/2023: `shorttext` 1.6.1 released.
diff --git a/docs/conf.py b/docs/conf.py
@@ -58,7 +58,7 @@
 # The short X.Y version.
 version = u'2.1'
 # The full version, including alpha/beta/rc tags.
-release = u'2.1.0'
+release = u'2.1.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/news.rst b/docs/news.rst
@@ -1,6 +1,7 @@
 News
 ====
 
+* 05/08/2025: `shorttext` 2.1.1 released.
 * 12/14/2024: `shorttext` 2.1.0 released.
 * 07/12/2024: `shorttext` 2.0.0 released.
 * 12/21/2023: `shorttext` 1.6.1 released.
@@ -82,27 +83,33 @@ News
 What's New
 ----------
 
-Released 2.1.0 (December 14, 2024)
-------------------------------
+Release 2.1.1 (May 8, 2025)
+---------------------------
+
+* Updated the Snowball stemmer (now requires `snowballstemmer>=3.0.0`);
+* Code cleaned up.
+
+Release 2.1.0 (December 14, 2024)
+---------------------------------
 
 * Use of `pyproject.toml` for package distribution.
 * Removed Cython components.
 * Huge relative import refactoring.
 
-Released 2.0.0 (July 13, 2024)
-------------------------------
+Release 2.0.0 (July 13, 2024)
+-----------------------------
 
 * Decommissioned support for Python 3.8.
 * Added support for Python 3.12.
 * Updated file extensions for model files.
 
-Released 1.6.1 (December 21, 2023)
-----------------------------------
+Release 1.6.1 (December 21, 2023)
+---------------------------------
 
 * Updated package requirements.
 
-Released 1.6.0 (August 26, 2023)
---------------------------------
+Release 1.6.0 (August 26, 2023)
+-------------------------------
 
 * Pinned requirements for ReadTheDocs documentation;
 * Fixed bugs in word-embedding model mean pooling classifiers;
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "shorttext"
-version = "2.1.1a1"
+version = "2.1.1"
 authors = [
     {name = "Kwan Yuet Stephen Ho", email = "stephenhky@yahoo.com.hk"}
 ]
@@ -38,7 +38,7 @@ dependencies = [
     "keras>=2.13.0",
     "gensim>=4.0.0",
     "pandas>=1.2.0",
-    "snowballstemmer>=2.0.0",
+    "snowballstemmer>=3.0.0",
     "transformers>=4.39.0",
     "torch>=2.0.0",
     "numba>=0.57.0",
diff --git a/shorttext/stack/stacking.py b/shorttext/stack/stacking.py
@@ -27,13 +27,13 @@ class StackedGeneralization(ABC):
     M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization,"
     *WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015).
     """
-    def __init__(self, intermediate_classifiers={}):
+    def __init__(self, intermediate_classifiers=None):
         """ Initialize the stacking class instance.
 
         :param intermediate_classifiers: dictionary, with key being a string, and the values intermediate classifiers, that have the method :func:`~score`, which takes a string as the input argument.
         :type intermediate_classifiers: dict
         """
-        self.classifiers = intermediate_classifiers
+        self.classifiers = intermediate_classifiers if intermediate_classifiers is not None else {}
         self.classlabels = []
         self.trained = False
 
diff --git a/shorttext/utils/compactmodel_io.py b/shorttext/utils/compactmodel_io.py
@@ -16,7 +16,7 @@
 from deprecation import deprecated
 
 
-def removedir(dir):
+def removedir(dir: str):
     """ Remove all subdirectories and files under the specified path.
 
     :param dir: path of the directory to be clean
@@ -25,9 +25,10 @@ def removedir(dir):
     for filename in os.listdir(dir):
-        if os.path.isdir(filename):
-            removedir(os.path.join(dir, filename))
-            os.rmdir(os.path.isdir(filename))
+        # os.listdir() yields bare names, so resolve each entry against `dir` first
+        path = os.path.join(dir, filename)
+        if os.path.isdir(path):
+            removedir(path)  # the recursive call also removes `path` itself
         else:
-            os.remove(dir+'/'+filename)
+            os.remove(path)
     os.rmdir(dir)
 
 
diff --git a/shorttext/utils/textpreprocessing.py b/shorttext/utils/textpreprocessing.py
@@ -6,12 +6,29 @@
 import snowballstemmer
 
 # tokenizer
-tokenize = lambda s: s.split(' ')
+def tokenize(s: str) -> list[str]:
+    """ Split a string into tokens on single space characters. """
+    return s.split(' ')
 
 
 # stemmer
-stemmer = snowballstemmer.stemmer('porter')
-stemword = lambda s: stemmer.stemWord(s)
+class StemmerSingleton:
+    """ Callable singleton that shares one Snowball `english` stemmer. """
+    def __new__(cls):
+        # create the instance and its stemmer only on the first instantiation
+        if not hasattr(cls, 'instance'):
+            cls.instance = super(StemmerSingleton, cls).__new__(cls)
+            cls.stemmer = snowballstemmer.stemmer('english')
+        return cls.instance
+
+    def __call__(self, s: str) -> str:
+        """ Return the stem of the word `s`. """
+        return self.stemmer.stemWord(s)
+
+
+def stemword(s: str) -> str:
+    """ Stem a single word using the shared stemmer. """
+    return StemmerSingleton()(s)
 
 
 def preprocess_text(text, pipeline):
diff --git a/test/test_bertrepresentations.py b/test/test_bertrepresentations.py
diff --git a/test/test_charonehot.py b/test/test_charonehot.py
@@ -6,15 +6,11 @@
 
 
 class TestCharOneHot(unittest.TestCase):
-    def setUp(self):
-        pass
-
-    def tearDown(self):
-        pass
-
     def test_BigTxt(self):
-        chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(urlopen('http://norvig.com/big.txt'),
-                                                                              encoding='utf-8')
+        chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(
+            urlopen('http://norvig.com/big.txt'),
+            encoding='utf-8'
+        )
         self.assertEqual(93, len(chartovec_encoder.dictionary))
         self.assertEqual('\n', chartovec_encoder.signalchar)
 
diff --git a/test/test_dtm.py b/test/test_dtm.py
@@ -32,14 +32,14 @@ def test_inaugural(self):
         dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True)
 
         # check results
-        self.assertEqual(len(dtm.dictionary), 5406)
-        self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'], 0.013801565936022027,
-                               places=4)
+        self.assertEqual(len(dtm.dictionary), 5256)
+        self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'], 0.0138,
+                               places=3)
         numdocs, numtokens = dtm.dtm.shape
         self.assertEqual(numdocs, 56)
-        self.assertEqual(numtokens, 5406)
-        self.assertAlmostEqual(dtm.get_total_termfreq('government'), 0.27584786568258396,
-                               places=4)
+        self.assertEqual(numtokens, 5256)
+        self.assertAlmostEqual(dtm.get_total_termfreq('government'), 0.27865372986738407,
+                               places=3)
 
 
 if __name__ == '__main__':
diff --git a/test/test_fuzzylogic.py b/test/test_fuzzylogic.py
@@ -3,13 +3,8 @@
 
 import shorttext
 
-class TestFuzzyLogic(unittest.TestCase):
-    def setUp(self):
-        pass
-
-    def tearDown(self):
-        pass
 
+class TestFuzzyLogic(unittest.TestCase):
     def test_similarity(self):
         self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('debug', 'deubg'), 1)
         self.assertEqual(shorttext.metrics.dynprog.dldist.damerau_levenshtein('intrdependence', 'interdpeendencae'), 3)
diff --git a/test/test_norvigspell.py b/test/test_norvigspell.py
@@ -10,14 +10,12 @@ def setUp(self):
         self.text = urlopen('http://norvig.com/big.txt').read()
         self.text = self.text.decode('utf-8')
 
-    def tearDown(self):
-        pass
-
     def test_norvig(self):
         speller = shorttext.spell.NorvigSpellCorrector()
         speller.train(self.text)
         self.assertEqual(speller.correct('apple'), 'apple')
         self.assertEqual(speller.correct('appl'), 'apply')
 
+
 if __name__ == '__main__':
-    unittest.main()
+    unittest.main()
diff --git a/test/test_sakaguchispell.py b/test/test_sakaguchispell.py
@@ -5,13 +5,8 @@
 from shorttext.spell.sakaguchi import SCRNNSpellCorrector
 from shorttext.smartload import smartload_compact_model
 
-class TestSCRNN(unittest.TestCase):
-    def setUp(self):
-        pass
-
-    def tearDown(self):
-        pass
 
+class TestSCRNN(unittest.TestCase):
     def generalproc(self, operation, typo='langudge', recommendation='language'):
         corrector = SCRNNSpellCorrector(operation)
         corrector.train('I am a nerd . Natural language processing is sosad .')
diff --git a/test/test_stacking.py b/test/test_stacking.py
@@ -7,6 +7,7 @@
 from shorttext.smartload import smartload_compact_model
 from sklearn.svm import SVC
 
+
 class TestStacking(unittest.TestCase):
     def setUp(self):
         self.nihdict = shorttext.data.nihreports(sample_size=None)
diff --git a/test/test_textpreprocessing.py b/test/test_textpreprocessing.py
@@ -4,12 +4,6 @@
 import shorttext
 
 class TestTextPreprocessing(unittest.TestCase):
-    def setUp(self):
-        pass
-
-    def tearDown(self):
-        pass
-
     def testStandardPipeline(self):
         preprocessor = shorttext.utils.standard_text_preprocessor_1()
         self.assertEqual(preprocessor('I love you.'), 'love')
diff --git a/test/test_var_nn_embedded_vec_classifier.py b/test/test_var_nn_embedded_vec_classifier.py
@@ -1,3 +1,4 @@
+
 import os
 import unittest
 import urllib