
Commit 5b3e87b

add additional text representations
1 parent a6a1509 commit 5b3e87b

7 files changed: +279 -30 lines changed

src/main/python/systemds/scuro/representations/bert.py

Lines changed: 10 additions & 29 deletions

@@ -19,20 +19,12 @@
 #
 # -------------------------------------------------------------
 
-import pickle
-
 import numpy as np
 
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 import torch
 from transformers import BertTokenizer, BertModel
-import os
-
-
-def read_text_file(file_path):
-    with open(file_path, "r", encoding="utf-8") as file:
-        text = file.read()
-    return text
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
 
 
 class Bert(UnimodalRepresentation):
@@ -42,18 +34,8 @@ def __init__(self, avg_layers=None, output_file=None):
         self.avg_layers = avg_layers
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices, get_sequences=False):
-        # Assumes text is stored in .txt files
-        data = []
-        if os.path.isdir(filepath):
-            for filename in os.listdir(filepath):
-                f = os.path.join(filepath, filename)
-                if os.path.isfile(f):
-                    with open(f, "r") as file:
-                        data.append(file.readlines()[0])
-        else:
-            with open(filepath, "r") as file:
-                data = file.readlines()
+    def parse_all(self, filepath, indices):
+        data = read_data_from_file(filepath, indices)
 
         model_name = "bert-base-uncased"
         tokenizer = BertTokenizer.from_pretrained(
@@ -65,13 +47,13 @@ def parse_all(self, filepath, indices, get_sequences=False):
         else:
             model = BertModel.from_pretrained(model_name)
 
-        embeddings = self.create_embeddings(data, model, tokenizer)
+        embeddings = self.create_embeddings(list(data.values()), model, tokenizer)
 
         if self.output_file is not None:
            data = {}
            for i in range(0, embeddings.shape[0]):
                data[indices[i]] = embeddings[i]
-           self.save_embeddings(data)
+           save_embeddings(data, self.output_file)
 
        return embeddings
 
@@ -88,14 +70,13 @@ def create_embeddings(self, data, model, tokenizer):
                     outputs.hidden_states[i][:, 0, :]
                     for i in range(-self.avg_layers, 0)
                 ]
-                cls_embedding = torch.mean(torch.stack(cls_embedding), dim=0)
+                cls_embedding = torch.mean(torch.stack(cls_embedding), dim=0).numpy()
             else:
                 cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
-            embeddings.append(cls_embedding.numpy())
+            embeddings.append(cls_embedding)
+
+        if self.output_file is not None:
+            save_embeddings(embeddings, self.output_file)
 
         embeddings = np.array(embeddings)
         return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1]))
-
-    def save_embeddings(self, data):
-        with open(self.output_file, "wb") as file:
-            pickle.dump(data, file)
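
For context, a minimal usage sketch of the refactored class. This is hedged: the directory layout, index names, and output path below are invented for illustration; per read_data_from_file, the path should carry a trailing slash and indices should name one text file per sample.

from systemds.scuro.representations.bert import Bert

# hypothetical layout: data/text/sample_1.txt, data/text/sample_2.txt
bert = Bert(avg_layers=None, output_file="bert_embeddings.pkl")
embeddings = bert.parse_all("data/text/", indices=["sample_1", "sample_2"])
print(embeddings.shape)  # one 768-dim row per segment for bert-base-uncased
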
Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+
+
+class BoW(UnimodalRepresentation):
+    def __init__(self, ngram_range, min_df, output_file=None):
+        super().__init__("BoW")
+        self.ngram_range = ngram_range
+        self.min_df = min_df
+        self.output_file = output_file
+
+    def parse_all(self, filepath, indices):
+        vectorizer = CountVectorizer(
+            ngram_range=(1, self.ngram_range), min_df=self.min_df
+        )
+
+        segments = read_data_from_file(filepath, indices)
+        X = vectorizer.fit_transform(segments.values())
+        X = X.toarray()
+
+        if self.output_file is not None:
+            df = pd.DataFrame(X)
+            df.index = segments.keys()
+
+            save_embeddings(df, self.output_file)
+
+        return X
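
A standalone sketch of the vectorization step with toy segments (invented data); note that ngram_range is passed to CountVectorizer as the upper bound of a (1, ngram_range) range, so BoW(2, 1) counts unigrams and bigrams:

from sklearn.feature_extraction.text import CountVectorizer

segments = {"s_1": "the cat sat", "s_2": "the cat sat on the mat"}  # toy data
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
X = vectorizer.fit_transform(segments.values()).toarray()
print(X.shape)  # (2, number of distinct unigrams and bigrams)
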
Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import nltk
+import numpy as np
+from nltk import word_tokenize
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+
+
+def load_glove_embeddings(file_path):
+    embeddings = {}
+    with open(file_path, "r", encoding="utf-8") as f:
+        for line in f:
+            values = line.split()
+            word = values[0]
+            vector = np.asarray(values[1:], dtype="float32")
+            embeddings[word] = vector
+    return embeddings
+
+
+class GloVe(UnimodalRepresentation):
+    def __init__(self, glove_path, output_file=None):
+        super().__init__("GloVe")
+        self.glove_path = glove_path
+        self.output_file = output_file
+
+    def parse_all(self, filepath, indices):
+        glove_embeddings = load_glove_embeddings(self.glove_path)
+        segments = read_data_from_file(filepath, indices)
+
+        embeddings = {}
+        for k, v in segments.items():
+            tokens = word_tokenize(v.lower())
+            embeddings[k] = np.mean(
+                [
+                    glove_embeddings[token]
+                    for token in tokens
+                    if token in glove_embeddings
+                ],
+                axis=0,
+            )
+
+        if self.output_file is not None:
+            save_embeddings(embeddings, self.output_file)
+
+        embeddings = np.array(list(embeddings.values()))
+        return embeddings
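
Each segment embedding is the mean of the GloVe vectors of its in-vocabulary tokens; a self-contained sketch with made-up 4-dimensional vectors standing in for a real glove.*.txt file:

import numpy as np

# stand-in for load_glove_embeddings(file_path)
glove_embeddings = {"hello": np.ones(4, dtype="float32"),
                    "world": np.zeros(4, dtype="float32")}
tokens = ["hello", "world", "unseen"]
vec = np.mean([glove_embeddings[t] for t in tokens if t in glove_embeddings], axis=0)
print(vec)  # [0.5 0.5 0.5 0.5]; the out-of-vocabulary token is skipped

One caveat: a segment whose tokens are all out of vocabulary hands np.mean an empty list, which yields NaN, whereas the W2V class below falls back to a zero vector.
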
Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+
+
+class TfIdf(UnimodalRepresentation):
+    def __init__(self, min_df, output_file=None):
+        super().__init__("TF-IDF")
+        self.min_df = min_df
+        self.output_file = output_file
+
+    def parse_all(self, filepath, indices):
+        vectorizer = TfidfVectorizer(min_df=self.min_df)
+
+        segments = read_data_from_file(filepath, indices)
+        X = vectorizer.fit_transform(segments.values())
+        X = X.toarray()
+
+        if self.output_file is not None:
+            df = pd.DataFrame(X)
+            df.index = segments.keys()
+
+            save_embeddings(df, self.output_file)
+
+        return X
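
As with BoW, an isolated sketch of what parse_all computes (toy segments, invented):

from sklearn.feature_extraction.text import TfidfVectorizer

segments = {"s_1": "the cat sat", "s_2": "the dog sat"}  # toy data
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(segments.values()).toarray()
print(X.shape)  # (2, vocabulary size); rows are L2-normalized tf-idf vectors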

src/main/python/systemds/scuro/representations/utils.py

Lines changed: 38 additions & 0 deletions

@@ -18,6 +18,8 @@
 # under the License.
 #
 # -------------------------------------------------------------
+import os
+import pickle
 
 import numpy as np
 
@@ -33,3 +35,39 @@ def pad_sequences(sequences, maxlen=None, dtype="float32", value=0):
         result[i, : len(data)] = data
 
     return result
+
+
+def get_segments(data, key_prefix):
+    segments = {}
+    counter = 1
+    for line in data:
+        line = line.replace("\n", "")
+        segments[key_prefix + str(counter)] = line
+        counter += 1
+
+    return segments
+
+
+def read_data_from_file(filepath, indices):
+    data = {}
+
+    is_dir = True if os.path.isdir(filepath) else False
+
+    if is_dir:
+        files = os.listdir(filepath)
+
+        # get file extension
+        _, ext = os.path.splitext(files[0])
+        for key in indices:
+            with open(filepath + key + ext) as segm:
+                data.update(get_segments(segm, key + "_"))
+    else:
+        with open(filepath) as file:
+            data.update(get_segments(file, ""))
+
+    return data
+
+
+def save_embeddings(data, file_name):
+    with open(file_name, "wb") as file:
+        pickle.dump(data, file)
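
The key scheme matters for the representations above: for a directory input, line n of <index><ext> is stored under the key "<index>_n" with a 1-based counter. A small sketch of get_segments on made-up file contents:

from systemds.scuro.representations.utils import get_segments

lines = ["first segment\n", "second segment\n"]  # made-up file contents
print(get_segments(lines, "sample_1_"))
# {'sample_1_1': 'first segment', 'sample_1_2': 'second segment'}
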
Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import numpy as np
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+from gensim.models import Word2Vec
+from nltk.tokenize import word_tokenize
+import nltk
+
+
+def get_embedding(sentence, model):
+    vectors = []
+    for word in sentence:
+        if word in model.wv:
+            vectors.append(model.wv[word])
+
+    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
+
+
+class W2V(UnimodalRepresentation):
+    def __init__(self, vector_size, min_count, window, output_file=None):
+        super().__init__("Word2Vec")
+        self.vector_size = vector_size
+        self.min_count = min_count
+        self.window = window
+        self.output_file = output_file
+
+    def parse_all(self, filepath, indices):
+        segments = read_data_from_file(filepath, indices)
+        embeddings = {}
+        t = [word_tokenize(s.lower()) for s in segments.values()]
+        model = Word2Vec(
+            sentences=t,
+            vector_size=self.vector_size,
+            window=self.window,
+            min_count=self.min_count,
+        )
+
+        for k, v in segments.items():
+            tokenized_words = word_tokenize(v.lower())
+            embeddings[k] = get_embedding(tokenized_words, model)
+
+        if self.output_file is not None:
+            save_embeddings(embeddings, self.output_file)
+
+        return np.array(list(embeddings.values()))
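
A hedged end-to-end sketch of the same steps on a toy corpus (sentences and parameters are invented; word_tokenize requires the NLTK "punkt" data to be downloaded):

import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

sentences = [word_tokenize(s.lower()) for s in ["The cat sat.", "The dog sat."]]
model = Word2Vec(sentences=sentences, vector_size=16, window=2, min_count=1)

# mean-pool the word vectors of one tokenized sentence, as get_embedding does
vectors = [model.wv[w] for w in sentences[0] if w in model.wv]
vec = np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
print(vec.shape)  # (16,)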

src/main/python/tests/scuro/data_generator.py

Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ def __init__(self, modalities, path, balanced=True):
         self.balanced = balanced
 
         for modality in modalities:
-            mod_path = f"{self.path}/{modality.name.lower()}"
+            mod_path = f"{self.path}/{modality.name.lower()}/"
             os.mkdir(mod_path)
             modality.file_path = mod_path
         self.labels = []
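
The trailing slash matters because read_data_from_file (above) builds per-sample paths by plain string concatenation, filepath + key + ext, not os.path.join:

# with the fix:    "data/text/" + "sample_1" + ".txt" -> "data/text/sample_1.txt"
# before the fix:  "data/text"  + "sample_1" + ".txt" -> "data/textsample_1.txt"
# (paths are illustrative)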
