apache · christinadionysio · Sep 2, 2025 · Aug 19, 2025 · Aug 19, 2025 · Aug 20, 2025
diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
@@ -38,6 +38,9 @@
 from systemds.scuro.representations.lstm import LSTM
 from systemds.scuro.representations.max import RowMax
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
+from systemds.scuro.representations.multimodal_attention_fusion import (
+    AttentionFusion,
+)
 from systemds.scuro.representations.mfcc import MFCC
 from systemds.scuro.representations.hadamard import Hadamard
 from systemds.scuro.representations.optical_flow import OpticalFlow
@@ -73,6 +76,12 @@
 from systemds.scuro.drsearch.unimodal_representation_optimizer import (
     UnimodalRepresentationOptimizer,
 )
+from systemds.scuro.representations.covarep_audio_features import (
+    RMSE,
+    Spectral,
+    ZeroCrossing,
+    Pitch,
+)
 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
 
@@ -131,4 +140,9 @@
     "UnimodalRepresentationOptimizer",
     "UnimodalOptimizer",
     "MultimodalOptimizer",
+    "ZeroCrossing",
+    "Pitch",
+    "RMSE",
+    "Spectral",
+    "AttentionFusion",
 ]
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -19,7 +19,6 @@
 #
 # -------------------------------------------------------------
 from typing import List, Optional, Union
-
 import librosa
 import numpy as np
 

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -98,6 +98,9 @@ def optimize_intramodal_representations(self, task):
                             ],
                         )
 
+    # TODO: check if order matters for reused reps - only compute once - check in cache
+    # TODO: parallelize - whenever an item of len 0 comes along give it to a new thread - merge results
+    # TODO: change the algorithm so that one representation is used until there is no more representations to add - saves a lot of memory
     def optimize_intermodal_representations(self, task):
         modality_combos = []
         n = len(self.k_best_cache[task.model.name])
@@ -122,8 +125,7 @@ def generate_extensions(current_combo, remaining_indices):
         reuse_fused_representations = False
         for i, modality_combo in enumerate(modality_combos):
             # clear reuse cache
-            if i % 5 == 0:
-                reuse_cache = self.prune_cache(modality_combos[i:], reuse_cache)
+            reuse_cache = self.prune_cache(modality_combos[i:], reuse_cache)
 
             if i != 0:
                 reuse_fused_representations = self.is_prefix_match(

diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py b/src/main/python/systemds/scuro/drsearch/operator_registry.py
@@ -43,6 +43,12 @@ def __new__(cls):
                 cls._representations[m_type] = []
         return cls._instance
 
+    def set_fusion_operators(self, fusion_operators):
+        if isinstance(fusion_operators, list):
+            self._context_operators = fusion_operators
+        else:
+            self._fusion_operators = [fusion_operators]
+
     def add_representation(
         self, representation: Representation, modality: ModalityType
     ):
@@ -57,6 +63,13 @@ def add_fusion_operator(self, fusion_operator):
     def get_representations(self, modality: ModalityType):
         return self._representations[modality]
 
+    def get_not_self_contained_representations(self, modality: ModalityType):
+        reps = []
+        for rep in self.get_representations(modality):
+            if not rep().self_contained:
+                reps.append(rep)
+        return reps
+
     def get_context_operators(self):
         # TODO: return modality specific context operations
         return self._context_operators

diff --git a/src/main/python/systemds/scuro/drsearch/task.py b/src/main/python/systemds/scuro/drsearch/task.py
@@ -19,8 +19,10 @@
 #
 # -------------------------------------------------------------
 import time
-from typing import List
+from typing import List, Union
 
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.representations.representation import Representation
 from systemds.scuro.models.model import Model
 import numpy as np
 from sklearn.model_selection import KFold
@@ -57,6 +59,8 @@ def __init__(
         self.inference_time = []
         self.training_time = []
         self.expected_dim = 1
+        self.train_scores = []
+        self.val_scores = []
 
     def get_train_test_split(self, data):
         X_train = [data[i] for i in self.train_indices]
@@ -73,28 +77,69 @@ def run(self, data):
          :param data: The aligned data used in the prediction process
          :return: the validation accuracy
         """
+        self._reset_params()
+        skf = KFold(n_splits=self.kfold, shuffle=True, random_state=11)
+
+        fold = 0
+        X, y, _, _ = self.get_train_test_split(data)
+
+        for train, test in skf.split(X, y):
+            train_X = np.array(X)[train]
+            train_y = np.array(y)[train]
+            test_X = np.array(X)[test]
+            test_y = np.array(y)[test]
+            self._run_fold(train_X, train_y, test_X, test_y)
+            fold += 1
+
+        if self.measure_performance:
+            self.inference_time = np.mean(self.inference_time)
+            self.training_time = np.mean(self.training_time)
+
+        return [np.mean(self.train_scores), np.mean(self.val_scores)]
+
+    def _reset_params(self):
         self.inference_time = []
         self.training_time = []
+        self.train_scores = []
+        self.val_scores = []
+
+    def _run_fold(self, train_X, train_y, test_X, test_y):
+        train_start = time.time()
+        train_score = self.model.fit(train_X, train_y, test_X, test_y)
+        train_end = time.time()
+        self.training_time.append(train_end - train_start)
+        self.train_scores.append(train_score)
+        test_start = time.time()
+        test_score = self.model.test(np.array(test_X), test_y)
+        test_end = time.time()
+        self.inference_time.append(test_end - test_start)
+        self.val_scores.append(test_score)
+
+    def create_representation_and_run(
+        self,
+        representation: Representation,
+        modalities: Union[List[Modality], Modality],
+    ):
+        self._reset_params()
         skf = KFold(n_splits=self.kfold, shuffle=True, random_state=11)
-        train_scores = []
-        test_scores = []
+
         fold = 0
-        X, y, X_test, y_test = self.get_train_test_split(data)
+        X, y, _, _ = self.get_train_test_split(data)
 
         for train, test in skf.split(X, y):
             train_X = np.array(X)[train]
             train_y = np.array(y)[train]
-            train_start = time.time()
-            train_score = self.model.fit(train_X, train_y, X_test, y_test)
-            train_end = time.time()
-            self.training_time.append(train_end - train_start)
-            train_scores.append(train_score)
-            test_start = time.time()
-            test_score = self.model.test(np.array(X_test), y_test)
-            test_end = time.time()
-            self.inference_time.append(test_end - test_start)
-            test_scores.append(test_score)
+            test_X = s.transform(np.array(X)[test])
+            test_y = np.array(y)[test]
+
+            if isinstance(modalities, Modality):
+                rep = modality.apply_representation(representation())
+            else:
+                representation().transform(
+                    train_X, train_y
+                )  # TODO: think about a way how to handle masks
 
+            self._run_fold(train_X, train_y, test_X, test_y)
             fold += 1
 
         if self.measure_performance:

diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -20,6 +20,7 @@
 # -------------------------------------------------------------
 import pickle
 import time
+import copy
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from dataclasses import dataclass, field, asdict
 
@@ -28,11 +29,17 @@
 
 import numpy as np
 from systemds.scuro.representations.window_aggregation import WindowAggregation
+from systemds.scuro.representations.concatenation import Concatenation
+from systemds.scuro.representations.hadamard import Hadamard
+from systemds.scuro.representations.sum import Sum
 
 from systemds.scuro.representations.aggregated_representation import (
     AggregatedRepresentation,
 )
-from systemds.scuro import ModalityType, Aggregation
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.representations.aggregate import Aggregation
 from systemds.scuro.drsearch.operator_registry import Registry
 from systemds.scuro.utils.schema_helpers import get_shape
 
@@ -84,7 +91,6 @@ def optimize_parallel(self, n_workers=None):
     def optimize(self):
         for modality in self.modalities:
             local_result = self._process_modality(modality, False)
-            # self._merge_results(local_result)
 
     def _process_modality(self, modality, parallel):
         if parallel:
@@ -95,43 +101,59 @@ def _process_modality(self, modality, parallel):
             local_results = self.operator_performance
 
         context_operators = self.operator_registry.get_context_operators()
-
-        for context_operator in context_operators:
-            context_representation = None
-            if (
-                modality.modality_type != ModalityType.TEXT
-                and modality.modality_type != ModalityType.VIDEO
-            ):
-                con_op = context_operator()
-                context_representation = modality.context(con_op)
-                self._evaluate_local(context_representation, [con_op], local_results)
-
-            modality_specific_operators = self.operator_registry.get_representations(
+        not_self_contained_reps = (
+            self.operator_registry.get_not_self_contained_representations(
                 modality.modality_type
             )
-            for modality_specific_operator in modality_specific_operators:
-                mod_context = None
-                mod_op = modality_specific_operator()
-                if context_representation is not None:
-                    mod_context = context_representation.apply_representation(mod_op)
-                    self._evaluate_local(mod_context, [con_op, mod_op], local_results)
-
-                mod = modality.apply_representation(mod_op)
-                self._evaluate_local(mod, [mod_op], local_results)
-
-                for context_operator_after in context_operators:
-                    con_op_after = context_operator_after()
-                    if mod_context is not None:
-                        mod_context = mod_context.context(con_op_after)
-                        self._evaluate_local(
-                            mod_context, [con_op, mod_op, con_op_after], local_results
-                        )
-
-                    mod = mod.context(con_op_after)
-                    self._evaluate_local(mod, [mod_op, con_op_after], local_results)
+        )
+        modality_specific_operators = self.operator_registry.get_representations(
+            modality.modality_type
+        )
+        for modality_specific_operator in modality_specific_operators:
+            mod_op = modality_specific_operator()
+
+            mod = modality.apply_representation(mod_op)
+            self._evaluate_local(mod, [mod_op], local_results)
+
+            if not mod_op.self_contained:
+                self._combine_non_self_contained_representations(
+                    modality, mod, not_self_contained_reps, local_results
+                )
+
+            for context_operator_after in context_operators:
+                con_op_after = context_operator_after()
+                mod = mod.context(con_op_after)
+                self._evaluate_local(mod, [mod_op, con_op_after], local_results)
 
             return local_results
 
+    def _combine_non_self_contained_representations(
+        self,
+        modality: Modality,
+        representation: TransformedModality,
+        other_representations,
+        local_results,
+    ):
+        combined = representation
+        context_operators = self.operator_registry.get_context_operators()
+        used_representations = representation.transformation
+        for other_representation in other_representations:
+            used_representations.append(other_representation())
+            for combination in [Concatenation(), Hadamard(), Sum()]:
+                combined = combined.combine(
+                    modality.apply_representation(other_representation()), combination
+                )
+                self._evaluate_local(
+                    combined, used_representations, local_results, combination
+                )
+
+                for context_op in context_operators:
+                    con_op = context_op()
+                    mod = combined.context(con_op)
+                    c_t = copy.deepcopy(used_representations)
+                    c_t.append(con_op)
+                    self._evaluate_local(mod, c_t, local_results, combination)
+
     def _merge_results(self, local_results):
         """Merge local results into the main results"""
         for modality_id in local_results.results:
@@ -145,7 +167,9 @@ def _merge_results(self, local_results):
                 for key, value in local_results.cache[modality][task_name].items():
                     self.operator_performance.cache[modality][task_name][key] = value
 
-    def _evaluate_local(self, modality, representations, local_results):
+    def _evaluate_local(
+        self, modality, representations, local_results, combination=None
+    ):
         if self._tasks_require_same_dims:
             if self.expected_dimensions == 1 and get_shape(modality.metadata) > 1:
                 # for aggregation in Aggregation().get_aggregation_functions():
@@ -165,6 +189,7 @@ def _evaluate_local(self, modality, representations, local_results):
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
             else:
                 modality.pad()
@@ -178,6 +203,7 @@ def _evaluate_local(self, modality, representations, local_results):
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
         else:
             for task in self.tasks:
@@ -198,6 +224,7 @@ def _evaluate_local(self, modality, representations, local_results):
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
                 else:
                     # modality.pad()
@@ -210,6 +237,7 @@ def _evaluate_local(self, modality, representations, local_results):
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
 
 
@@ -228,7 +256,9 @@ def __init__(self, modalities, tasks, debug=False):
                 self.cache[modality][task_name] = {}
                 self.results[modality][task_name] = []
 
-    def add_result(self, scores, representations, modality, task_name, task_time):
+    def add_result(
+        self, scores, representations, modality, task_name, task_time, combination
+    ):
         parameters = []
         representation_names = []
 
@@ -256,6 +286,7 @@ def add_result(self, scores, representations, modality, task_name, task_time):
             val_score=scores[1],
             representation_time=modality.transform_time,
             task_time=task_time,
+            combination=combination.name if combination else "",
         )
         self.results[modality.modality_id][task_name].append(entry)
         self.cache[modality.modality_id][task_name][
@@ -302,3 +333,4 @@ class ResultEntry:
     train_score: float
     representation_time: float
     task_time: float
+    combination: str
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
@@ -168,6 +168,6 @@ def is_aligned(self, other_modality):
                 != list(other_modality.metadata.values())[i]["data_layout"]["shape"]
             ):
                 aligned = False
-                continue
+                break
 
         return aligned