apache
diff --git a/‎.github/workflows/python.yml‎
Lines changed: 3 additions & 1 deletion b/‎.github/workflows/python.yml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/main/python/systemds/scuro/__init__.py‎
Lines changed: 10 additions & 1 deletion b/‎src/main/python/systemds/scuro/__init__.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎src/main/python/systemds/scuro/drsearch/operator_registry.py‎
Lines changed: 28 additions & 0 deletions b/‎src/main/python/systemds/scuro/drsearch/operator_registry.py‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎src/main/python/systemds/scuro/drsearch/representation_dag.py‎
Lines changed: 5 additions & 0 deletions b/‎src/main/python/systemds/scuro/drsearch/representation_dag.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py‎
Lines changed: 39 additions & 4 deletions b/‎src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py‎
Lines changed: 39 additions & 4 deletions
diff --git a/‎src/main/python/systemds/scuro/modality/transformed.py‎
Lines changed: 9 additions & 0 deletions b/‎src/main/python/systemds/scuro/modality/transformed.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/main/python/systemds/scuro/representations/dimensionality_reduction.py‎
Lines changed: 81 additions & 0 deletions b/‎src/main/python/systemds/scuro/representations/dimensionality_reduction.py‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎src/main/python/systemds/scuro/representations/glove.py‎
Lines changed: 12 additions & 10 deletions b/‎src/main/python/systemds/scuro/representations/glove.py‎
Lines changed: 12 additions & 10 deletions
@@ -142,7 +142,7 @@ jobs:
         export PATH=$SYSTEMDS_ROOT/bin:$PATH
         cd src/main/python
         ./tests/federated/runFedTest.sh
-
+    
     - name: Cache Torch Hub
       if: ${{ matrix.test_mode == 'scuro' }}
       id: torch-cache
@@ -158,6 +158,8 @@ jobs:
       env:
         TORCH_HOME: ${{ github.workspace }}/.torch
       run: |
+        df -h
+        exit
         ( while true; do echo "."; sleep 25; done ) &
         KA=$!
         pip install --upgrade pip wheel setuptools
 
@@ -116,7 +116,13 @@
     OverlappingSplitIndices,
 )
 from systemds.scuro.representations.elmo import ELMoRepresentation
-
+from systemds.scuro.representations.dimensionality_reduction import (
+    DimensionalityReduction,
+)
+from systemds.scuro.representations.mlp_averaging import MLPAveraging
+from systemds.scuro.representations.mlp_learned_dim_reduction import (
+    MLPLearnedDimReduction,
+)
 
 __all__ = [
     "BaseLoader",
@@ -202,4 +208,7 @@
     "ELMoRepresentation",
     "SentenceBoundarySplitIndices",
     "OverlappingSplitIndices",
+    "MLPAveraging",
+    "MLPLearnedDimReduction",
+    "DimensionalityReduction",
 ]
@@ -37,6 +37,7 @@ class Registry:
     _fusion_operators = []
     _text_context_operators = []
     _video_context_operators = []
+    _dimensionality_reduction_operators = {}
 
     def __new__(cls):
         if not cls._instance:
@@ -73,6 +74,18 @@ def add_context_operator(self, context_operator, modality_type):
     def add_fusion_operator(self, fusion_operator):
         self._fusion_operators.append(fusion_operator)
 
+    def add_dimensionality_reduction_operator(
+        self, dimensionality_reduction_operator, modality_type
+    ):
+        if not isinstance(modality_type, list):
+            modality_type = [modality_type]
+        for m_type in modality_type:
+            if not m_type in self._dimensionality_reduction_operators.keys():
+                self._dimensionality_reduction_operators[m_type] = []
+            self._dimensionality_reduction_operators[m_type].append(
+                dimensionality_reduction_operator
+            )
+
     def get_representations(self, modality: ModalityType):
         return self._representations[modality]
 
@@ -86,6 +99,9 @@ def get_not_self_contained_representations(self, modality: ModalityType):
     def get_context_operators(self, modality_type):
         return self._context_operators[modality_type]
 
+    def get_dimensionality_reduction_operators(self, modality_type):
+        return self._dimensionality_reduction_operators[modality_type]
+
     def get_fusion_operators(self):
         return self._fusion_operators
 
@@ -127,6 +143,18 @@ def decorator(cls):
     return decorator
 
 
+def register_dimensionality_reduction_operator(modality_type):
+    """
+    Decorator to register a dimensionality reduction operator.
+    """
+
+    def decorator(cls):
+        Registry().add_dimensionality_reduction_operator(cls, modality_type)
+        return cls
+
+    return decorator
+
+
 def register_context_operator(modality_type):
     """
     Decorator to register a context operator.
 
@@ -30,6 +30,9 @@
     AggregatedRepresentation,
 )
 from systemds.scuro.representations.context import Context
+from systemds.scuro.representations.dimensionality_reduction import (
+    DimensionalityReduction,
+)
 from systemds.scuro.utils.identifier import get_op_id, get_node_id
 
 from collections import OrderedDict
@@ -195,6 +198,8 @@ def execute_node(node_id: str, task) -> TransformedModality:
                     # It's a unimodal operation
                     if isinstance(node_operation, Context):
                         result = input_mods[0].context(node_operation)
+                    elif isinstance(node_operation, DimensionalityReduction):
+                        result = input_mods[0].dimensionality_reduction(node_operation)
                     elif isinstance(node_operation, AggregatedRepresentation):
                         result = node_operation.transform(input_mods[0])
                     elif isinstance(node_operation, UnimodalRepresentation):
 
@@ -25,7 +25,6 @@
 import multiprocessing as mp
 from typing import List, Any
 from functools import lru_cache
-from systemds.scuro.drsearch.task import Task
 from systemds.scuro import ModalityType
 from systemds.scuro.drsearch.ranking import rank_by_tradeoff
 from systemds.scuro.drsearch.task import PerformanceMeasure
@@ -92,6 +91,12 @@ def _get_not_self_contained_reps(self, modality_type):
     def _get_context_operators(self, modality_type):
         return self.operator_registry.get_context_operators(modality_type)
 
+    @lru_cache(maxsize=32)
+    def _get_dimensionality_reduction_operators(self, modality_type):
+        return self.operator_registry.get_dimensionality_reduction_operators(
+            modality_type
+        )
+
     def store_results(self, file_name=None):
         if file_name is None:
             import time
@@ -185,9 +190,7 @@ def _process_modality(self, modality, parallel):
 
         external_cache = LRUCache(max_size=32)
         for dag in dags:
-            representations = dag.execute(
-                [modality], task=self.tasks[0], external_cache=external_cache
-            )  # TODO: dynamic task selection
+            representations = dag.execute([modality], external_cache=external_cache)
             node_id = list(representations.keys())[-1]
             node = dag.get_node_by_id(node_id)
             if node.operation is None:
@@ -303,6 +306,27 @@ def _evaluate_local(self, modality, local_results, dag, combination=None):
                         scores, modality, task.model.name, end - start, combination, dag
                     )
 
+    def add_dimensionality_reduction_operators(self, builder, current_node_id):
+        dags = []
+        modality_type = (
+            builder.get_node(current_node_id).operation().output_modality_type
+        )
+
+        if modality_type is not ModalityType.EMBEDDING:
+            return None
+
+        dimensionality_reduction_operators = (
+            self._get_dimensionality_reduction_operators(modality_type)
+        )
+        for dimensionality_reduction_op in dimensionality_reduction_operators:
+            dimensionality_reduction_node_id = builder.create_operation_node(
+                dimensionality_reduction_op,
+                [current_node_id],
+                dimensionality_reduction_op().get_current_parameters(),
+            )
+            dags.append(builder.build(dimensionality_reduction_node_id))
+        return dags
+
     def _build_modality_dag(
         self, modality: Modality, operator: Any
     ) -> List[RepresentationDag]:
@@ -316,6 +340,12 @@ def _build_modality_dag(
         current_node_id = rep_node_id
         dags.append(builder.build(current_node_id))
 
+        dimensionality_reduction_dags = self.add_dimensionality_reduction_operators(
+            builder, current_node_id
+        )
+        if dimensionality_reduction_dags is not None:
+            dags.extend(dimensionality_reduction_dags)
+
         if operator.needs_context:
             context_operators = self._get_context_operators(modality.modality_type)
             for context_op in context_operators:
@@ -339,6 +369,11 @@ def _build_modality_dag(
                     [context_node_id],
                     operator.get_current_parameters(),
                 )
+                dimensionality_reduction_dags = self.add_dimensionality_reduction_operators(
+                    builder, context_rep_node_id
+                )  # TODO: check if this is correctly using the 3d approach of the dimensionality reduction operator
+                if dimensionality_reduction_dags is not None:
+                    dags.extend(dimensionality_reduction_dags)
 
                 agg_operator = AggregatedRepresentation()
                 context_agg_node_id = builder.create_operation_node(
 
@@ -122,6 +122,15 @@ def context(self, context_operator):
         transformed_modality.transform_time += time.time() - start
         return transformed_modality
 
+    def dimensionality_reduction(self, dimensionality_reduction_operator):
+        transformed_modality = TransformedModality(
+            self, dimensionality_reduction_operator, self_contained=self.self_contained
+        )
+        start = time.time()
+        transformed_modality.data = dimensionality_reduction_operator.execute(self.data)
+        transformed_modality.transform_time += time.time() - start
+        return transformed_modality
+
     def apply_representation(self, representation):
         start = time.time()
         new_modality = representation.transform(self)
 
@@ -0,0 +1,81 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import abc
+
+import numpy as np
+
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.representations.representation import Representation
+
+
+class DimensionalityReduction(Representation):
+    def __init__(self, name, parameters=None):
+        """
+        Parent class for different dimensionality reduction operations
+        :param name: Name of the dimensionality reduction operator
+        """
+        super().__init__(name, parameters)
+        self.needs_training = False
+
+    @abc.abstractmethod
+    def execute(self, data, labels=None):
+        """
+        Implemented for every child class and creates a sampled representation for a given modality
+        :param data: data to apply the dimensionality reduction on
+        :param labels: labels for learned dimensionality reduction
+        :return: dimensionality reduced data
+        """
+        if labels is not None:
+            self.execute_with_training(data, labels)
+        else:
+            self.execute(data)
+
+    def apply_representation(self, data):
+        """
+        Implemented for every child class and creates a dimensionality reduced representation for a given modality
+        :param data: data to apply the representation on
+        :return: dimensionality reduced data
+        """
+        raise f"Not implemented for Dimensionality Reduction Operator: {self.name}"
+
+    def execute_with_training(self, modality, task):
+        fusion_train_indices = task.fusion_train_indices
+        # Handle 3d data
+        data = modality.data
+        if (
+            len(np.array(modality.data).shape) == 3
+            and np.array(modality.data).shape[1] == 1
+        ):
+            data = np.array([x.reshape(-1) for x in modality.data])
+        transformed_train = self.execute(
+            np.array(data)[fusion_train_indices], task.labels[fusion_train_indices]
+        )
+
+        all_other_indices = [
+            i for i in range(len(modality.data)) if i not in fusion_train_indices
+        ]
+        transformed_other = self.apply_representation(np.array(data)[all_other_indices])
+
+        transformed_data = np.zeros((len(data), transformed_train.shape[1]))
+        transformed_data[fusion_train_indices] = transformed_train
+        transformed_data[all_other_indices] = transformed_other
+
+        return transformed_data
@@ -59,18 +59,20 @@ def transform(self, modality):
         glove_embeddings = load_glove_embeddings(self.glove_path)
 
         embeddings = []
+        embedding_dim = (
+            len(next(iter(glove_embeddings.values()))) if glove_embeddings else 100
+        )
+
         for sentences in modality.data:
             tokens = list(tokenize(sentences.lower()))
-            embeddings.append(
-                np.mean(
-                    [
-                        glove_embeddings[token]
-                        for token in tokens
-                        if token in glove_embeddings
-                    ],
-                    axis=0,
-                )
-            )
+            token_embeddings = [
+                glove_embeddings[token] for token in tokens if token in glove_embeddings
+            ]
+
+            if len(token_embeddings) > 0:
+                embeddings.append(np.mean(token_embeddings, axis=0))
+            else:
+                embeddings.append(np.zeros(embedding_dim, dtype=np.float32))
 
         if self.output_file is not None:
             save_embeddings(np.array(embeddings), self.output_file)