Skip to content

Commit 3cc0728

Browse files
change to previous tokenizer
1 parent 254e312 commit 3cc0728

File tree

2 files changed

+8
-16
lines changed

2 files changed

+8
-16
lines changed

src/main/python/systemds/scuro/representations/tfidf.py

Lines changed: 5 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -19,17 +19,11 @@
1919
#
2020
# -------------------------------------------------------------
2121
import numpy as np
22-
from textblob import TextBlob
2322

23+
from sklearn.feature_extraction.text import TfidfVectorizer
2424
from systemds.scuro.modality.transformed import TransformedModality
2525
from systemds.scuro.representations.unimodal import UnimodalRepresentation
2626
from systemds.scuro.representations.utils import save_embeddings
27-
from gensim import models
28-
from gensim.corpora import Dictionary
29-
30-
import nltk
31-
32-
nltk.download("punkt_tab")
3327

3428

3529
class TfIdf(UnimodalRepresentation):
@@ -43,12 +37,10 @@ def transform(self, modality):
4337
modality.modality_type, self, modality.metadata
4438
)
4539

46-
tokens = [list(TextBlob(s).words) for s in modality.data]
47-
dictionary = Dictionary()
48-
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in tokens]
49-
tfidf = models.TfidfModel(BoW_corpus, smartirs="ntc")
50-
X = tfidf[BoW_corpus]
51-
X = [np.array(x)[:, 1].reshape(1, -1) for x in X]
40+
vectorizer = TfidfVectorizer(min_df=self.min_df)
41+
42+
X = vectorizer.fit_transform(modality.data)
43+
X = [np.array(x).reshape(1, -1) for x in X.toarray()]
5244

5345
if self.output_file is not None:
5446
save_embeddings(X, self.output_file)

src/main/python/systemds/scuro/representations/word2vec.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
from systemds.scuro.representations.unimodal import UnimodalRepresentation
2424
from systemds.scuro.representations.utils import save_embeddings
2525
from gensim.models import Word2Vec
26-
from textblob import TextBlob
26+
from gensim.utils import tokenize
2727

2828
import nltk
2929

@@ -51,7 +51,7 @@ def transform(self, modality):
5151
transformed_modality = TransformedModality(
5252
modality.modality_type, self, modality.metadata
5353
)
54-
t = [list(TextBlob(s).words) for s in modality.data]
54+
t = [list(tokenize(s.lower())) for s in modality.data]
5555
model = Word2Vec(
5656
sentences=t,
5757
vector_size=self.vector_size,
@@ -60,7 +60,7 @@ def transform(self, modality):
6060
)
6161
embeddings = []
6262
for sentences in modality.data:
63-
tokens = list(TextBlob(sentences).words)
63+
tokens = list(tokenize(sentences.lower()))
6464
embeddings.append(np.array(get_embedding(tokens, model)).reshape(1, -1))
6565

6666
if self.output_file is not None:

0 commit comments

Comments
 (0)