Skip to content

Commit 1b84ed6

Browse files
authored
Merge pull request #387 from stephenhky/develop
Release 2.1.1
2 parents 7a654e0 + 05136ce commit 1b84ed6

16 files changed

+54
-100
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ If you would like to contribute, feel free to submit the pull requests. You can
8383

8484
## News
8585

86+
* 05/08/2025: `shorttext` 2.1.1 released.
8687
* 12/14/2024: `shorttext` 2.1.0 released.
8788
* 07/12/2024: `shorttext` 2.0.0 released.
8889
* 12/21/2023: `shorttext` 1.6.1 released.

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
# The short X.Y version.
5959
version = u'2.1'
6060
# The full version, including alpha/beta/rc tags.
61-
release = u'2.1.0'
61+
release = u'2.1.1'
6262

6363
# The language for content autogenerated by Sphinx. Refer to documentation
6464
# for a list of supported languages.

docs/news.rst

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
News
22
====
33

4+
* 05/08/2025: `shorttext` 2.1.1 released.
45
* 12/14/2024: `shorttext` 2.1.0 released.
56
* 07/12/2024: `shorttext` 2.0.0 released.
67
* 12/21/2023: `shorttext` 1.6.1 released.
@@ -82,27 +83,33 @@ News
8283
What's New
8384
----------
8485

85-
Released 2.1.0 (December 14, 2024)
86-
------------------------------
86+
Release 2.1.1 (May 8, 2025)
87+
---------------------------
88+
89+
* Update of Snowball stemmer;
90+
* Code cleaned up.
91+
92+
Release 2.1.0 (December 14, 2024)
93+
---------------------------------
8794

8895
* Use of `pyproject.toml` for package distribution.
8996
* Removed Cython components.
9097
* Huge relative import refactoring.
9198

92-
Released 2.0.0 (July 13, 2024)
93-
------------------------------
99+
Release 2.0.0 (July 13, 2024)
100+
-----------------------------
94101

95102
* Decommissioned support for Python 3.8.
96103
* Added support for Python 3.12.
97104
* Updated file extensions for model files.
98105

99-
Released 1.6.1 (December 21, 2023)
100-
----------------------------------
106+
Release 1.6.1 (December 21, 2023)
107+
---------------------------------
101108

102109
* Updated package requirements.
103110

104-
Released 1.6.0 (August 26, 2023)
105-
--------------------------------
111+
Release 1.6.0 (August 26, 2023)
112+
-------------------------------
106113

107114
* Pinned requirements for ReadTheDocs documentation;
108115
* Fixed bugs in word-embedding model mean pooling classifiers;

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "shorttext"
7-
version = "2.1.1a1"
7+
version = "2.1.1"
88
authors = [
99
{name = "Kwan Yuet Stephen Ho", email = "[email protected]"}
1010
]
@@ -38,7 +38,7 @@ dependencies = [
3838
"keras>=2.13.0",
3939
"gensim>=4.0.0",
4040
"pandas>=1.2.0",
41-
"snowballstemmer>=2.0.0",
41+
"snowballstemmer>=3.0.0",
4242
"transformers>=4.39.0",
4343
"torch>=2.0.0",
4444
"numba>=0.57.0",

shorttext/stack/stacking.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@ class StackedGeneralization(ABC):
2727
M. Paz Sesmero, Agapito I. Ledezma, Araceli Sanchis, "Generating ensembles of heterogeneous classifiers using Stacked Generalization,"
2828
*WIREs Data Mining and Knowledge Discovery* 5: 21-34 (2015).
2929
"""
30-
def __init__(self, intermediate_classifiers={}):
30+
def __init__(self, intermediate_classifiers=None):
3131
""" Initialize the stacking class instance.
3232
3333
:param intermediate_classifiers: dictionary, with key being a string, and the values intermediate classifiers, that have the method :func:`~score`, which takes a string as the input argument.
3434
:type intermediate_classifiers: dict
3535
"""
36-
self.classifiers = intermediate_classifiers
36+
self.classifiers = intermediate_classifiers if intermediate_classifiers is not None else {}
3737
self.classlabels = []
3838
self.trained = False
3939

shorttext/utils/compactmodel_io.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from deprecation import deprecated
1717

1818

19-
def removedir(dir):
19+
def removedir(dir: str):
2020
""" Remove all subdirectories and files under the specified path.
2121
2222
:param dir: path of the directory to be cleaned
@@ -25,9 +25,9 @@ def removedir(dir):
2525
for filename in os.listdir(dir):
2626
if os.path.isdir(filename):
2727
removedir(os.path.join(dir, filename))
28-
os.rmdir(os.path.isdir(filename))
28+
os.rmdir(os.path.join(filename))
2929
else:
30-
os.remove(dir+'/'+filename)
30+
os.remove(os.path.join(dir, filename))
3131
os.rmdir(dir)
3232

3333

shorttext/utils/textpreprocessing.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,23 @@
66
import snowballstemmer
77

88
# tokenizer
9-
tokenize = lambda s: s.split(' ')
9+
def tokenize(s: str) -> list[str]:
10+
return s.split(' ')
1011

1112

1213
# stemmer
13-
stemmer = snowballstemmer.stemmer('porter')
14-
stemword = lambda s: stemmer.stemWord(s)
14+
class StemmerSingleton:
15+
def __new__(cls):
16+
if not hasattr(cls, 'instance'):
17+
cls.instance = super(StemmerSingleton, cls).__new__(cls)
18+
cls.stemmer = snowballstemmer.stemmer('english')
19+
return cls.instance
20+
21+
def __call__(cls, s: str) -> str:
22+
return cls.stemmer.stemWord(s)
23+
24+
def stemword(s: str) -> str:
25+
return StemmerSingleton()(s)
1526

1627

1728
def preprocess_text(text, pipeline):

test/test_bertrepresentations.py

Lines changed: 0 additions & 45 deletions
This file was deleted.

test/test_charonehot.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,11 @@
66

77

88
class TestCharOneHot(unittest.TestCase):
9-
def setUp(self):
10-
pass
11-
12-
def tearDown(self):
13-
pass
14-
159
def test_BigTxt(self):
16-
chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(urlopen('http://norvig.com/big.txt'),
17-
encoding='utf-8')
10+
chartovec_encoder = shorttext.generators.initSentenceToCharVecEncoder(
11+
urlopen('http://norvig.com/big.txt'),
12+
encoding='utf-8'
13+
)
1814
self.assertEqual(93, len(chartovec_encoder.dictionary))
1915
self.assertEqual('\n', chartovec_encoder.signalchar)
2016

test/test_dtm.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,14 @@ def test_inaugural(self):
3232
dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True)
3333

3434
# check results
35-
self.assertEqual(len(dtm.dictionary), 5406)
36-
self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'], 0.013801565936022027,
37-
places=4)
35+
self.assertEqual(len(dtm.dictionary), 5256)
36+
self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'], 0.0138,
37+
places=3)
3838
numdocs, numtokens = dtm.dtm.shape
3939
self.assertEqual(numdocs, 56)
40-
self.assertEqual(numtokens, 5406)
41-
self.assertAlmostEqual(dtm.get_total_termfreq('government'), 0.27584786568258396,
42-
places=4)
40+
self.assertEqual(numtokens, 5256)
41+
self.assertAlmostEqual(dtm.get_total_termfreq('government'), 0.27865372986738407,
42+
places=3)
4343

4444

4545
if __name__ == '__main__':

0 commit comments

Comments
 (0)