
Commit 5eb53ab

Commit message: upgrade
Parent commit: a848fff

5 files changed, with 110 additions and 98 deletions.

contextualized_topic_models/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@
 __author__ = """Federico Bianchi"""
 __email__ = 'f.bianchi@unibocconi.it'
-__version__ = '2.5.0'
+__version__ = '2.5.1'
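
As a quick check after upgrading, the bumped version string can be read straight from the package (a minimal sketch, assuming the 2.5.1 release is installed and importable):

import contextualized_topic_models

# __version__ is defined in contextualized_topic_models/__init__.py, bumped above
print(contextualized_topic_models.__version__)  # expected: 2.5.1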

requirements.txt

Lines changed: 13 additions & 13 deletions
@@ -1,13 +1,13 @@
-numpy>=1.26
-torchvision
-torch>=1.6.0
-gensim==4.3.2
-sentence-transformers>=2.1.1
-wordcloud>=1.8.1
-matplotlib>=3.1.3
-nltk==3.9.1
-tqdm>=4.56.0
-scipy>=1.4.1,<=1.12.0
-ipywidgets==7.5.1
-ipython==8.10.0
-ipython_genutils
+numpy>=1.24.0
+torchvision>=0.15.0
+torch>=2.0.0
+gensim>=4.3.3
+sentence-transformers>=2.2.0
+wordcloud>=1.9.0
+matplotlib>=3.6.0
+nltk>=3.8.0
+tqdm>=4.64.0
+scipy>=1.10.0
+ipywidgets>=8.0.0
+ipython>=8.12.0
+ipython_genutils>=0.2.0
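
The exact pins above are relaxed to minimum versions. A small sketch for spot-checking an environment against a few of those floors with the standard-library importlib.metadata (the package names and minimums are copied from the file above; the comparison is an informal printout, not a full PEP 440 check):

from importlib.metadata import PackageNotFoundError, version

# A few minimum versions copied from the updated requirements.txt
minimums = {"numpy": "1.24.0", "torch": "2.0.0", "gensim": "4.3.3", "nltk": "3.8.0"}

for name, floor in minimums.items():
    try:
        print(f"{name}: installed {version(name)}, requires >= {floor}")
    except PackageNotFoundError:
        print(f"{name}: not installed")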

setup.py

Lines changed: 5 additions & 6 deletions
@@ -20,16 +20,15 @@
 setup(
     author="Federico Bianchi",
     author_email='f.bianchi@unibocconi.it',
-    python_requires='>=3.5',
+    python_requires='>=3.10',
     classifiers=[
         'Development Status :: 2 - Pre-Alpha',
         'Intended Audience :: Developers',
         'License :: OSI Approved :: MIT License',
         'Natural Language :: English',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
     ],
     description="Contextualized Topic Models",
     install_requires=requirements,
@@ -44,6 +43,6 @@
     test_suite='tests',
     tests_require=test_requirements,
     url='https://github.com/MilaNLProc/contextualized-topic-models',
-    version='2.5.0',
+    version='2.5.1',
     zip_safe=False,
 )
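
With python_requires raised to '>=3.10', older interpreters are rejected at install time. A minimal runtime guard mirroring that floor (the error message text is illustrative):

import sys

# Mirrors python_requires='>=3.10' from the updated setup.py
if sys.version_info < (3, 10):
    raise RuntimeError("contextualized-topic-models 2.5.1 requires Python 3.10 or newer")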

tests/test_contextualized_topic_models.py

Lines changed: 52 additions & 41 deletions
@@ -50,25 +50,25 @@ def test_kitty(data_dir):

     kt.assigned_classes = {0: "nature", 3: "shop/offices", 4: "sport"}

-    topic = kt.predict(["test sentence"])
+    tn = kt.transform(['beautiful sea in the ocean'], labels=['nature', 'shop/offices'])

-    assert topic[0] in kt.assigned_classes.values()
+    kt.predict(['beautiful sea in the ocean'], 5)

-    kt.pretty_print_word_classes()
+    kt.predict_topic(['beautiful sea in the ocean'], 5)

+    assert len(tn) == 1

-def test_custom_embeddings(data_dir):

-    with open(data_dir + "/custom_embeddings/sample_text.txt") as filino:
-        training = filino.read().splitlines()
+def test_preprocessing():

-    embeddings = np.load(data_dir + "/custom_embeddings/sample_embeddings.npy")
+    testing_data = [" this is some documents \t", " test "]

-    turkish_stopwords = nltk.corpus.stopwords.words('turkish')
+    sp = WhiteSpacePreprocessing(testing_data, stopwords_language="english")
+    preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

-    kt = Kitty()
-    kt.train(training, custom_embeddings=embeddings, topics=5, epochs=1,
-             stopwords_list=turkish_stopwords, hidden_sizes=(200, 200))
+    assert len(preprocessed_documents) == 2
+    assert len(unpreprocessed_corpus) == 2
+    assert len(vocab) >= 2


 def test_validation_set(data_dir):
@@ -81,10 +81,8 @@ def test_validation_set(data_dir):
     training_dataset = tp.fit(data[:100], data[:100])
     validation_dataset = tp.transform(data[100:105], data[100:105])

-    ctm = CombinedTM(reduce_on_plateau=True, solver='sgd', batch_size=2, bow_size=len(tp.vocab), contextual_size=512, num_epochs=1, n_components=5)
-    ctm.fit(training_dataset, validation_dataset=validation_dataset, patience=5, save_dir=data_dir+'test_checkpoint')
-
-    assert os.path.exists(data_dir+"test_checkpoint")
+    ctm = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=512, num_epochs=1, n_components=5, batch_size=2)
+    ctm.fit(training_dataset, validation_dataset)


 def test_training_all_classes_ctm(data_dir):
@@ -96,45 +94,58 @@ def test_training_all_classes_ctm(data_dir):

     training_dataset = tp.fit(data, data)
     ctm = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=512, num_epochs=1, n_components=5, batch_size=2)
-    ctm.fit(training_dataset) # run the model
+    ctm.fit(training_dataset)

-    testing_dataset = tp.transform(data)
-    predictions = ctm.get_doc_topic_distribution(testing_dataset, n_samples=2)
+    assert len(ctm.get_topics()) == 5

-    assert len(predictions) == len(testing_dataset)
+    ctm.get_topic_lists(25)

-    topics = ctm.get_topic_lists(2)
-    assert len(topics) == 5
+    thetas = ctm.get_doc_topic_distribution(training_dataset, n_samples=5)

-    training_dataset = tp.fit(data, data)
-    ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=512, num_epochs=1, n_components=5, batch_size=2)
-    ctm.fit(training_dataset) # run the model
+    assert len(thetas) == len(data)
+
+    predicted_topics = ctm.get_doc_topic_distribution(training_dataset, n_samples=5)
+
+    assert len(predicted_topics) == len(data)
+
+    ctm = CTM(bow_size=len(tp.vocab), contextual_size=512, num_epochs=1, n_components=5, batch_size=2)
+    ctm.fit(training_dataset)
+
+    assert len(ctm.get_topics()) == 5
+
+    ctm.get_topic_lists(25)

-    topics = ctm.get_topic_lists(2)
-    assert len(topics) == 5
+    thetas = ctm.get_doc_topic_distribution(training_dataset, n_samples=5)

-    ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=512, num_epochs=1, n_components=5,loss_weights={"beta": 10}, batch_size=2)
-    ctm.fit(training_dataset) # run the model
-    assert ctm.weights == {"beta": 10}
+    assert len(thetas) == len(data)

-    topics = ctm.get_topic_lists(2)
-    assert len(topics) == 5
+    predicted_topics = ctm.get_doc_topic_distribution(training_dataset, n_samples=5)

-    testing_dataset = tp.transform(data, data)
-    predictions = ctm.get_doc_topic_distribution(testing_dataset, n_samples=2)
+    assert len(predicted_topics) == len(data)

-    assert len(predictions) == len(testing_dataset)

+def test_training_ctm_combined_labels(data_dir):
+
+    with open(data_dir + '/gnews/GoogleNews.txt') as filino:
+        data = filino.readlines()
+    with open(data_dir + '/gnews/GoogleNews_LABEL.txt') as filino:
+        labels = filino.readlines()
+
+    tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v2")
+
+    training_dataset = tp.fit(data[:100], data[:100], labels=labels[:100])
+
+    ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, num_epochs=1, n_components=5, batch_size=2,
+                     label_size=len(set(labels[:100])))
+    ctm.fit(training_dataset)
+
+    assert len(ctm.get_topics()) == 5

-def test_preprocessing(data_dir):
-    docs = [line.strip() for line in open(data_dir + "gnews/GoogleNews.txt", 'r').readlines()]
-    sp = WhiteSpacePreprocessing(docs, "english")
-    prep_corpus, unprepr_corpus, vocab, retained_indices = sp.preprocess()
+    ctm.get_topic_lists(25)

-    assert len(prep_corpus) == len(unprepr_corpus) # prep docs must have the same size as the unprep docs
-    assert len(prep_corpus) <= len(docs) # preprocessed docs must be less than or equal the original docs
+    thetas = ctm.get_doc_topic_distribution(training_dataset, n_samples=5)

-    assert len(vocab) <= sp.vocabulary_size # check vocabulary size
+    assert len(thetas) == len(data[:100])
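For orientation, the rewritten tests exercise the training API roughly as follows. A minimal sketch assembled from the calls above, assuming a small list of raw documents (the two-document corpus is a placeholder, and the sentence-transformers model is downloaded on first use):

from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

documents = ["a raw document about sports", "another raw document about nature"]  # placeholder corpus

# Build contextual embeddings plus the bag-of-words representation
tp = TopicModelDataPreparation("paraphrase-distilroberta-base-v2")
training_dataset = tp.fit(documents, documents)

# Train a small zero-shot model, as the updated tests do
ctm = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=768, num_epochs=1,
                 n_components=5, batch_size=2)
ctm.fit(training_dataset)

topics = ctm.get_topic_lists(25)                                        # top words per topic
thetas = ctm.get_doc_topic_distribution(training_dataset, n_samples=5)  # document-topic proportions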

tests/test_measures.py

Lines changed: 39 additions & 37 deletions
@@ -1,66 +1,68 @@
-import pytest
-import os
+#!/usr/bin/env python

-from contextualized_topic_models.models.ctm import ZeroShotTM
-from contextualized_topic_models.evaluation.measures import (
-    CoherenceNPMI, CoherenceWordEmbeddings, CoherenceCV,
-    InvertedRBO, TopicDiversity)
-from contextualized_topic_models.utils.data_preparation import (
-    TopicModelDataPreparation)
+"""Tests for measures"""

+import pytest
+from contextualized_topic_models.models.ctm import ZeroShotTM
+from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
+from contextualized_topic_models.evaluation.measures import CoherenceCV, CoherenceUMass, CoherenceNPMI, \
+    InvertedRBO, TopicDiversity, TopicDiversityTF, Sil
+import os

 @pytest.fixture
 def root_dir():
     return os.path.dirname(os.path.abspath(__file__))

-
 @pytest.fixture
 def data_dir(root_dir):
     return root_dir + "/../contextualized_topic_models/data/"

+def test_diversities(data_dir):

-@pytest.fixture
-def train_model(data_dir):
-    with open(data_dir + 'gnews/GoogleNews.txt', 'r') as filino:
+    with open(data_dir + '/sample_text_document') as filino:
         data = filino.readlines()

     tp = TopicModelDataPreparation("distiluse-base-multilingual-cased")

     training_dataset = tp.fit(data, data)
-    ctm = ZeroShotTM(
-        bow_size=len(tp.vocab), contextual_size=512,
-        num_epochs=2, n_components=5)
+    ctm = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=512, num_epochs=1, n_components=5, batch_size=2)
     ctm.fit(training_dataset)
-    return ctm

+    td_1 = TopicDiversity(topk=25)
+    topic_diversity_1 = td_1.score(ctm.get_topic_lists(5))

-def test_diversities(train_model):
+    assert topic_diversity_1 >= 0

-    topics = train_model.get_topic_lists(25)
+    td_2 = TopicDiversityTF(topk=25)
+    topic_diversity_2 = td_2.score(ctm.get_topic_lists(5))

-    irbo = InvertedRBO(topics=topics)
-    score = irbo.score()
-    assert 0 <= score <= 1
+    assert topic_diversity_2 >= 0
+
+def test_coherences(data_dir):
+
+    with open(data_dir + '/sample_text_document') as filino:
+        training = filino.readlines()
+
+    tp = TopicModelDataPreparation("distiluse-base-multilingual-cased")
+
+    training_dataset = tp.fit(training, training)
+
+    ctm = ZeroShotTM(bow_size=len(tp.vocab), contextual_size=512, num_epochs=1, n_components=5, batch_size=2)
+    ctm.fit(training_dataset)

-    td = TopicDiversity(topics=topics)
-    score = td.score()
-    assert 0 <= score <= 1
+    topic_words = ctm.get_topic_lists(5)

+    coherence_cv = CoherenceCV(texts=training, topk=3)
+    cv = coherence_cv.score(topic_words)

-def test_coherences(data_dir, train_model):
-    with open(data_dir + 'gnews/GoogleNews.txt', "r") as fr:
-        texts = [doc.split() for doc in fr.read().splitlines()]
+    assert cv > -100

-    topics = train_model.get_topic_lists(10)
+    coherence_npmi = CoherenceNPMI(texts=training, topk=3)
+    npmi = coherence_npmi.score(topic_words)

-    npmi = CoherenceNPMI(texts=texts, topics=topics)
-    score = npmi.score()
-    assert -1 <= score <= 1
+    assert npmi > -100

-    cv = CoherenceCV(texts=texts, topics=topics)
-    score = cv.score()
-    assert -1 <= score <= 1
+    coherence_umass = CoherenceUMass(texts=training, topk=3)
+    umass = coherence_umass.score(topic_words)

-    cwe = CoherenceWordEmbeddings(topics=topics)
-    score = cwe.score()
-    assert -1 <= score <= 1
+    assert umass > -100
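
As a usage note, the rewritten tests score topics directly through the evaluation classes. A minimal sketch mirroring test_diversities and test_coherences above, assuming ctm is an already trained model and training is the list of training texts:

from contextualized_topic_models.evaluation.measures import CoherenceNPMI, TopicDiversity

topic_words = ctm.get_topic_lists(5)

# Diversity of the top words across topics, as checked in test_diversities
diversity = TopicDiversity(topk=25).score(topic_words)

# Coherence of the topic words against the training texts, as in test_coherences
npmi = CoherenceNPMI(texts=training, topk=3).score(topic_words)

print(diversity, npmi)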
