From 21352eeec181b1ce095b62762cd41ff1388f2659 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Mon, 28 Jul 2025 14:13:02 +0200
Subject: [PATCH 01/19] add improved unimodal optimizer

---
 .../scuro/drsearch/unimodal_optimizer.py      | 119 ++++++++++++++++++
 .../systemds/scuro/utils/schema_helpers.py    |   4 +
 2 files changed, 123 insertions(+)
 create mode 100644 src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py

diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
new file mode 100644
index 00000000000..24839088df8
--- /dev/null
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -0,0 +1,119 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from build.lib.systemds.scuro.representations.aggregated_representation import AggregatedRepresentation
+from systemds.scuro import ModalityType, Aggregation
+from systemds.scuro.drsearch.operator_registry import Registry
+from systemds.scuro.utils.schema_helpers import get_shape
+
+
+class UnimodalOptimizer:
+    def __init__(self, modalities, tasks):
+        self.modalities = modalities
+        self.tasks = tasks
+        
+        self.operator_registry = Registry()
+        self.operator_performance = {}
+
+        for modality in self.modalities:
+            self.operator_performance[modality.modality_id] = {}
+            for task in tasks:
+                self.operator_performance[modality.modality_id][task.model.name] = UnimodalResults(modality.modality_id, task.name)
+        
+    
+    def get_k_best_results(self, modality, k, task):
+        """
+        Get the k best results for the given modality
+        :param modality: modality to get the best results for
+        :param k: number of best results
+        """
+        
+        results = self.operator_performance[modality.modality_id][task.model.name].get_k_best_results(k)
+
+        return results
+    
+    def optimize(self):
+        for modality in self.modalities:
+            context_operators = self.operator_registry.get_context_operators()
+            
+            for context_operator in context_operators:
+                context_representation = None
+                if modality.modality_type != ModalityType.TEXT:
+                    con_op = context_operator()
+                    context_representation = modality.context(con_op)
+                    self.evaluate(context_representation, [context_operator.__name__], [con_op.parameters])
+                
+                modality_specific_operators = self.operator_registry.get_representations(modality.modality_type)
+                for modality_specific_operator in modality_specific_operators:
+                    mod_context = None
+                    mod_op = modality_specific_operator()
+                    if context_representation is not None:
+                        mod_context = context_representation.apply_representation(mod_op)
+                        self.evaluate(mod_context, [context_operator.__name__, modality_specific_operator.__name__], [con_op.parameters, mod_op.parameters])
+                    
+                    
+                    mod = modality.apply_representation(mod_op)
+                    self.evaluate(mod, [modality_specific_operator.__name__],
+                                  [mod_op.parameters])
+                    
+                    for context_operator_after in context_operators:
+                        con_op_after = context_operator_after()
+                        if mod_context is not None:
+                            mod_context = mod_context.context(con_op_after)
+                            self.evaluate(mod_context,
+                                          [context_operator.__name__, modality_specific_operator.__name__, context_operator_after.__name__],
+                                          [con_op.parameters, mod_op.parameters, con_op_after.parameters])
+                        
+                        mod = mod.context(con_op_after)
+                        self.evaluate(mod, [modality_specific_operator.__name__, context_operator_after.__name__],
+                                      [mod_op.parameters, con_op_after.parameters])
+    
+    def evaluate(self, modality, representation_names, params):
+        for task in self.tasks:
+            if task.expected_dim == 1 and get_shape(modality.metadata) > 1:
+                for aggregation in Aggregation().get_aggregation_functions():
+                    # padding should not be necessary here
+                    agg_operator = AggregatedRepresentation(Aggregation(aggregation, False))
+                    agg_modality = agg_operator.transform(modality)
+                    
+                    scores = task.run(agg_modality.data)
+                    rep_names = representation_names.copy()
+                    rep_names.append(agg_operator.name)
+                    
+                    rep_params = params.copy()
+                    rep_params.append(agg_operator.parameters)
+                    self.operator_performance[modality.modality_id][task.model.name].add_result(scores, rep_names, rep_params)
+            else:
+                scores = task.run(modality.data)
+                self.operator_performance[modality.modality_id][task.model.name].add_result(scores, representation_names, params)
+                
+                    
+class UnimodalResults:
+    def __init__(self, modality_id, task_name):
+        self.modality_id = modality_id
+        self.task_name = task_name
+        self.results = {'representations': [], 'params': [], 'train_score': [], 'val_score':[]}
+    
+    def add_result(self, scores, representations, params):
+        self.results['representations'].append(representations)
+        self.results['params'].append([param.copy() if param is not None else param for param in params ])
+        self.results['train_score'].append(scores[0])
+        self.results['val_score'].append(scores[1])
+    
\ No newline at end of file
diff --git a/src/main/python/systemds/scuro/utils/schema_helpers.py b/src/main/python/systemds/scuro/utils/schema_helpers.py
index 28af476cca4..3d1fbf4d71a 100644
--- a/src/main/python/systemds/scuro/utils/schema_helpers.py
+++ b/src/main/python/systemds/scuro/utils/schema_helpers.py
@@ -40,3 +40,7 @@ def calculate_new_frequency(new_length, old_length, old_frequency):
     duration = old_length / old_frequency
     new_frequency = new_length / duration
     return new_frequency
+
+
+def get_shape(metadata):
+    return len(list(metadata.values())[0]["data_layout"]["shape"])

From faa31a2bcd9f86d9a15dfd4cc2de9fc3e51ba94f Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Mon, 28 Jul 2025 14:25:39 +0200
Subject: [PATCH 02/19] add subclass to store results

---
 .../tests/scuro/test_unimodal_optimizer.py     | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index 9ed034e5fe8..41bd2af1367 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -20,7 +20,6 @@
 # -------------------------------------------------------------
 
 
-import shutil
 import unittest
 
 import numpy as np
@@ -31,8 +30,8 @@
 from systemds.scuro.drsearch.operator_registry import Registry
 from systemds.scuro.models.model import Model
 from systemds.scuro.drsearch.task import Task
-from systemds.scuro.drsearch.unimodal_representation_optimizer import (
-    UnimodalRepresentationOptimizer,
+from systemds.scuro.drsearch.unimodal_optimizer import (
+    UnimodalOptimizer,
 )
 
 from systemds.scuro.representations.spectrogram import Spectrogram
@@ -41,9 +40,6 @@
 from systemds.scuro.representations.resnet import ResNet
 from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader
 
-from systemds.scuro.dataloader.audio_loader import AudioLoader
-from systemds.scuro.dataloader.video_loader import VideoLoader
-from systemds.scuro.dataloader.text_loader import TextLoader
 from systemds.scuro.modality.type import ModalityType
 
 
@@ -186,21 +182,21 @@ def optimize_unimodal_representation_for_modality(self, modality):
         ):
             registry = Registry()
 
-            unimodal_optimizer = UnimodalRepresentationOptimizer(
-                [modality], self.tasks, max_chain_depth=2
+            unimodal_optimizer = UnimodalOptimizer(
+                [modality], self.tasks
             )
             unimodal_optimizer.optimize()
 
             assert (
-                list(unimodal_optimizer.optimization_results.keys())[0]
+                list(unimodal_optimizer.operator_performance.keys())[0]
                 == modality.modality_id
             )
-            assert len(list(unimodal_optimizer.optimization_results.values())[0]) == 2
+            assert len(list(unimodal_optimizer.operator_performance.values())[0]) == 2
             assert (
                 len(
                     unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[
                         0
-                    ].operator_chain
+                    ].representations
                 )
                 >= 1
             )

From 1b6a2ed265428d812312e9ac3d87489d3cc78afa Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Tue, 29 Jul 2025 10:30:57 +0200
Subject: [PATCH 03/19] add params as strings

---
 .../scuro/drsearch/multimodal_optimizer.py    | 103 +++++++++++++
 .../scuro/drsearch/operator_registry.py       |  12 ++
 .../scuro/drsearch/unimodal_optimizer.py      | 136 ++++++++++++------
 3 files changed, 205 insertions(+), 46 deletions(-)
 create mode 100644 src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
new file mode 100644
index 00000000000..6f064f896c5
--- /dev/null
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -0,0 +1,103 @@
+from systemds.scuro.representations.aggregated_representation import (
+    AggregatedRepresentation,
+)
+
+from systemds.scuro.representations.aggregate import Aggregation
+
+from systemds.scuro.drsearch.operator_registry import Registry
+
+from systemds.scuro.utils.schema_helpers import get_shape
+import dataclasses
+
+
+class MultimodalOptimizer:
+    def __init__(self, modalities, unimodal_optimizer, tasks, k=2):
+        self.k_best_modalities = None
+        self.modalities = modalities
+        self.unimodal_optimizer = unimodal_optimizer
+        self.tasks = tasks
+        self.k = k
+        self.extract_k_best_modalities_per_task()
+        self.operator_registry = Registry()
+        self.optimization_results = {}
+
+    def optimize(self):
+        for task in self.tasks:
+            for modality in self.modalities:
+                representations = self.k_best_modalities[task][modality.modality_id]
+                applied_representations = []
+                for i in range(0, len(representations)):
+                    applied_representation = modality
+                    for j, rep in enumerate(representations[i].representations):
+                        representation, is_context = (
+                            self.operator_registry.get_representation_by_name(
+                                rep, modality.modality_type
+                            )
+                        )
+                        if representation is None:
+                            if rep == AggregatedRepresentation.__name__:
+                                representation = AggregatedRepresentation(Aggregation())
+                        else:
+                            representation = representation()
+                        representation.set_parameters(representations[i].params[j])
+                        if is_context:
+                            applied_representation = applied_representation.context(
+                                representation
+                            )
+                        else:
+                            applied_representation = (
+                                applied_representation.apply_representation(
+                                    representation
+                                )
+                            )
+                    applied_representations.append(applied_representation)
+
+    def evaluate(self, task, modality, representation_names, params):
+        if task.expected_dim == 1 and get_shape(modality.metadata) > 1:
+            for aggregation in Aggregation().get_aggregation_functions():
+                # padding should not be necessary here
+                agg_operator = AggregatedRepresentation(Aggregation(aggregation, False))
+                agg_modality = agg_operator.transform(modality)
+
+                scores = task.run(agg_modality.data)
+                rep_names = representation_names.copy()
+                rep_names.append(agg_operator.name)
+
+                rep_params = params.copy()
+                rep_params.append(agg_operator.parameters)
+                self.optimization_results[modality.modality_id][
+                    task.model.name
+                ].add_result(scores, rep_names, rep_params)
+        else:
+            scores = task.run(modality.data)
+            self.optimization_results[modality.modality_id][task.model.name].add_result(
+                scores, representation_names, params
+            )
+
+    def extract_k_best_modalities_per_task(self):
+        self.k_best_modalities = {}
+        for task in self.tasks:
+            self.k_best_modalities[task] = {}
+            for modality in self.modalities:
+                self.k_best_modalities[task][modality.modality_id] = (
+                    self.unimodal_optimizer.get_k_best_results(modality, self.k, task)
+                )
+
+
+class MultimodalResults:
+    def __init__(self, modality, task):
+        self.modality_id = modality.modality_id
+        self.task = task
+
+        self.results = []
+
+
+@dataclasses.dataclass
+class MultimodalResultEntry:
+    val_score: float
+    modality_ids: list
+    representations: list
+    fusion_method: str
+    representation_params: list
+    train_score: float
+    fusion_params: list
diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py b/src/main/python/systemds/scuro/drsearch/operator_registry.py
index cfd313eb563..3909b51ff98 100644
--- a/src/main/python/systemds/scuro/drsearch/operator_registry.py
+++ b/src/main/python/systemds/scuro/drsearch/operator_registry.py
@@ -64,6 +64,18 @@ def get_context_operators(self):
     def get_fusion_operators(self):
         return self._fusion_operators
 
+    def get_representation_by_name(self, representation_name, modality_type):
+        for representation in self._context_operators:
+            if representation.__name__ == representation_name:
+                return representation, True
+
+        if modality_type is not None:
+            for representation in self._representations[modality_type]:
+                if representation.__name__ == representation_name:
+                    return representation, False
+
+        return None, False
+
 
 def register_representation(modalities: Union[ModalityType, List[ModalityType]]):
     """
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 24839088df8..99093a8faa6 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -18,7 +18,13 @@
 # under the License.
 #
 # -------------------------------------------------------------
-from build.lib.systemds.scuro.representations.aggregated_representation import AggregatedRepresentation
+from dataclasses import dataclass
+
+from systemds.scuro.representations.window_aggregation import WindowAggregation
+
+from build.lib.systemds.scuro.representations.aggregated_representation import (
+    AggregatedRepresentation,
+)
 from systemds.scuro import ModalityType, Aggregation
 from systemds.scuro.drsearch.operator_registry import Registry
 from systemds.scuro.utils.schema_helpers import get_shape
@@ -28,92 +34,130 @@ class UnimodalOptimizer:
     def __init__(self, modalities, tasks):
         self.modalities = modalities
         self.tasks = tasks
-        
+
         self.operator_registry = Registry()
         self.operator_performance = {}
 
         for modality in self.modalities:
             self.operator_performance[modality.modality_id] = {}
             for task in tasks:
-                self.operator_performance[modality.modality_id][task.model.name] = UnimodalResults(modality.modality_id, task.name)
-        
-    
+                self.operator_performance[modality.modality_id][task.model.name] = (
+                    UnimodalResults(modality.modality_id, task.name)
+                )
+
     def get_k_best_results(self, modality, k, task):
         """
         Get the k best results for the given modality
         :param modality: modality to get the best results for
         :param k: number of best results
         """
-        
-        results = self.operator_performance[modality.modality_id][task.model.name].get_k_best_results(k)
+
+        results = sorted(
+            self.operator_performance[modality.modality_id][task.model.name].results,
+            key=lambda x: x.val_score,
+            reverse=True,
+        )[:k]
 
         return results
-    
+
     def optimize(self):
         for modality in self.modalities:
             context_operators = self.operator_registry.get_context_operators()
-            
+
             for context_operator in context_operators:
                 context_representation = None
                 if modality.modality_type != ModalityType.TEXT:
                     con_op = context_operator()
                     context_representation = modality.context(con_op)
-                    self.evaluate(context_representation, [context_operator.__name__], [con_op.parameters])
-                
-                modality_specific_operators = self.operator_registry.get_representations(modality.modality_type)
+                    self.evaluate(context_representation, [con_op])
+
+                modality_specific_operators = (
+                    self.operator_registry.get_representations(modality.modality_type)
+                )
                 for modality_specific_operator in modality_specific_operators:
                     mod_context = None
                     mod_op = modality_specific_operator()
                     if context_representation is not None:
-                        mod_context = context_representation.apply_representation(mod_op)
-                        self.evaluate(mod_context, [context_operator.__name__, modality_specific_operator.__name__], [con_op.parameters, mod_op.parameters])
-                    
-                    
+                        mod_context = context_representation.apply_representation(
+                            mod_op
+                        )
+                        self.evaluate(mod_context, [con_op, mod_op])
+
                     mod = modality.apply_representation(mod_op)
-                    self.evaluate(mod, [modality_specific_operator.__name__],
-                                  [mod_op.parameters])
-                    
+                    self.evaluate(mod, [mod_op])
+
                     for context_operator_after in context_operators:
                         con_op_after = context_operator_after()
                         if mod_context is not None:
                             mod_context = mod_context.context(con_op_after)
-                            self.evaluate(mod_context,
-                                          [context_operator.__name__, modality_specific_operator.__name__, context_operator_after.__name__],
-                                          [con_op.parameters, mod_op.parameters, con_op_after.parameters])
-                        
+                            self.evaluate(mod_context, [con_op, mod_op, con_op_after])
+
                         mod = mod.context(con_op_after)
-                        self.evaluate(mod, [modality_specific_operator.__name__, context_operator_after.__name__],
-                                      [mod_op.parameters, con_op_after.parameters])
-    
-    def evaluate(self, modality, representation_names, params):
+                        self.evaluate(mod, [mod_op, con_op_after])
+
+    def evaluate(self, modality, representations):
         for task in self.tasks:
             if task.expected_dim == 1 and get_shape(modality.metadata) > 1:
                 for aggregation in Aggregation().get_aggregation_functions():
                     # padding should not be necessary here
-                    agg_operator = AggregatedRepresentation(Aggregation(aggregation, False))
+                    agg_operator = AggregatedRepresentation(
+                        Aggregation(aggregation, False)
+                    )
                     agg_modality = agg_operator.transform(modality)
-                    
+
                     scores = task.run(agg_modality.data)
-                    rep_names = representation_names.copy()
-                    rep_names.append(agg_operator.name)
-                    
-                    rep_params = params.copy()
-                    rep_params.append(agg_operator.parameters)
-                    self.operator_performance[modality.modality_id][task.model.name].add_result(scores, rep_names, rep_params)
+                    reps = representations.copy()
+                    reps.append(agg_operator)
+
+                    self.operator_performance[modality.modality_id][
+                        task.model.name
+                    ].add_result(scores, reps)
             else:
                 scores = task.run(modality.data)
-                self.operator_performance[modality.modality_id][task.model.name].add_result(scores, representation_names, params)
-                
-                    
+                self.operator_performance[modality.modality_id][
+                    task.model.name
+                ].add_result(scores, representations)
+
+
 class UnimodalResults:
     def __init__(self, modality_id, task_name):
         self.modality_id = modality_id
         self.task_name = task_name
-        self.results = {'representations': [], 'params': [], 'train_score': [], 'val_score':[]}
-    
-    def add_result(self, scores, representations, params):
-        self.results['representations'].append(representations)
-        self.results['params'].append([param.copy() if param is not None else param for param in params ])
-        self.results['train_score'].append(scores[0])
-        self.results['val_score'].append(scores[1])
-    
\ No newline at end of file
+        self.results = []
+
+    def add_result(self, scores, representations):
+        parameters = []
+        representation_names = []
+
+        for rep in representations:
+            representation_names.append(type(rep).__name__)
+            if isinstance(rep, AggregatedRepresentation):
+                parameters.append(rep.parameters)
+                continue
+
+            params = {}
+            for param in rep.parameters.keys():
+                params[param] = getattr(rep, param)
+
+            if isinstance(rep, WindowAggregation):
+                params["aggregation_function"] = (
+                    rep.aggregation_function.aggregation_function_name
+                )
+
+            parameters.append(params)
+
+        entry = ResultEntry(
+            representations=representation_names,
+            params=parameters,
+            train_score=scores[0],
+            val_score=scores[1],
+        )
+        self.results.append(entry)
+
+
+@dataclass
+class ResultEntry:
+    val_score: float
+    representations: list
+    params: list
+    train_score: float

From 940b232fcefc55c2ab255ad0dec2e7b814ec16cd Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Tue, 29 Jul 2025 11:22:44 +0200
Subject: [PATCH 04/19] improve optimization results

---
 .../scuro/drsearch/multimodal_optimizer.py    | 110 +++++++++++++-----
 .../scuro/drsearch/unimodal_optimizer.py      |  49 +++++---
 2 files changed, 108 insertions(+), 51 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index 6f064f896c5..74901f6204b 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -21,38 +21,73 @@ def __init__(self, modalities, unimodal_optimizer, tasks, k=2):
         self.operator_registry = Registry()
         self.optimization_results = {}
 
+        for modality in self.modalities:
+            self.optimization_results[modality.modality_id] = {}
+            for task in tasks:
+                self.optimization_results[modality.modality_id][task.model.name] = (
+                    MultimodalResults(modality, task.name)
+                )
+
     def optimize(self):
         for task in self.tasks:
             for modality in self.modalities:
                 representations = self.k_best_modalities[task][modality.modality_id]
-                applied_representations = []
-                for i in range(0, len(representations)):
-                    applied_representation = modality
-                    for j, rep in enumerate(representations[i].representations):
-                        representation, is_context = (
-                            self.operator_registry.get_representation_by_name(
-                                rep, modality.modality_type
-                            )
+                applied_representations = self.extract_representations(
+                    representations, modality
+                )
+                combined_representations = []
+                for i in range(1, len(applied_representations)):
+                    for fusion_method in self.operator_registry.get_fusion_operators():
+                        combined = applied_representations[i - 1].combine(
+                            applied_representations[i], fusion_method()
+                        )
+                        self.evaluate(
+                            task,
+                            combined,
+                            [i - 1, i],
+                            fusion_method,
+                            [modality.modality_id],
                         )
-                        if representation is None:
-                            if rep == AggregatedRepresentation.__name__:
-                                representation = AggregatedRepresentation(Aggregation())
-                        else:
-                            representation = representation()
-                        representation.set_parameters(representations[i].params[j])
-                        if is_context:
-                            applied_representation = applied_representation.context(
-                                representation
+                        if not fusion_method().commutative:
+                            combined_comm = applied_representations[i].combine(
+                                applied_representations[i - 1], fusion_method()
                             )
-                        else:
-                            applied_representation = (
-                                applied_representation.apply_representation(
-                                    representation
-                                )
+                            self.evaluate(
+                                task,
+                                combined_comm,
+                                [i, i - 1],
+                                fusion_method,
+                                [modality.modality_id],
                             )
-                    applied_representations.append(applied_representation)
 
-    def evaluate(self, task, modality, representation_names, params):
+    def extract_representations(self, representations, modality):
+        applied_representations = []
+        for i in range(0, len(representations)):
+            applied_representation = modality
+            for j, rep in enumerate(representations[i].representations):
+                representation, is_context = (
+                    self.operator_registry.get_representation_by_name(
+                        rep, modality.modality_type
+                    )
+                )
+                if representation is None:
+                    if rep == AggregatedRepresentation.__name__:
+                        representation = AggregatedRepresentation(Aggregation())
+                else:
+                    representation = representation()
+                representation.set_parameters(representations[i].params[j])
+                if is_context:
+                    applied_representation = applied_representation.context(
+                        representation
+                    )
+                else:
+                    applied_representation = (
+                        applied_representation.apply_representation(representation)
+                    )
+            applied_representations.append(applied_representation)
+        return applied_representations
+
+    def evaluate(self, task, modality, representations, fusion, modality_ids):
         if task.expected_dim == 1 and get_shape(modality.metadata) > 1:
             for aggregation in Aggregation().get_aggregation_functions():
                 # padding should not be necessary here
@@ -60,18 +95,16 @@ def evaluate(self, task, modality, representation_names, params):
                 agg_modality = agg_operator.transform(modality)
 
                 scores = task.run(agg_modality.data)
-                rep_names = representation_names.copy()
-                rep_names.append(agg_operator.name)
+                reps = representations.copy()
+                reps.append(agg_operator)
 
-                rep_params = params.copy()
-                rep_params.append(agg_operator.parameters)
                 self.optimization_results[modality.modality_id][
                     task.model.name
-                ].add_result(scores, rep_names, rep_params)
+                ].add_result(scores, reps, fusion, modality_ids, task)
         else:
             scores = task.run(modality.data)
             self.optimization_results[modality.modality_id][task.model.name].add_result(
-                scores, representation_names, params
+                scores, representations, fusion, modality_ids, task
             )
 
     def extract_k_best_modalities_per_task(self):
@@ -91,6 +124,20 @@ def __init__(self, modality, task):
 
         self.results = []
 
+    def add_result(
+        self, scores, best_representation_idx, fusion_method, modality_ids, task
+    ):
+
+        entry = MultimodalResultEntry(
+            representations=best_representation_idx,
+            train_score=scores[0],
+            val_score=scores[1],
+            fusion_method=fusion_method.__name__,
+            modality_ids=modality_ids,
+            task=task,
+        )
+        self.results.append(entry)
+
 
 @dataclasses.dataclass
 class MultimodalResultEntry:
@@ -98,6 +145,5 @@ class MultimodalResultEntry:
     modality_ids: list
     representations: list
     fusion_method: str
-    representation_params: list
     train_score: float
-    fusion_params: list
+    task: str
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 99093a8faa6..3f6671595e6 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -36,14 +36,14 @@ def __init__(self, modalities, tasks):
         self.tasks = tasks
 
         self.operator_registry = Registry()
-        self.operator_performance = {}
+        self.operator_performance = UnimodalResults(modalities, tasks)
 
-        for modality in self.modalities:
-            self.operator_performance[modality.modality_id] = {}
-            for task in tasks:
-                self.operator_performance[modality.modality_id][task.model.name] = (
-                    UnimodalResults(modality.modality_id, task.name)
-                )
+        # for modality in self.modalities:
+        #     self.operator_performance[modality.modality_id] = {}
+        #     for task in tasks:
+        #         self.operator_performance[modality.modality_id][task.model.name] = (
+        #             UnimodalResults(modality.modality_id, task.name)
+        #         )
 
     def get_k_best_results(self, modality, k, task):
         """
@@ -109,23 +109,28 @@ def evaluate(self, modality, representations):
                     reps = representations.copy()
                     reps.append(agg_operator)
 
-                    self.operator_performance[modality.modality_id][
-                        task.model.name
-                    ].add_result(scores, reps)
+                    self.operator_performance.add_result(
+                        scores, reps, modality.modality_id, task.model.name
+                    )
             else:
                 scores = task.run(modality.data)
-                self.operator_performance[modality.modality_id][
-                    task.model.name
-                ].add_result(scores, representations)
+                self.operator_performance.add_result(
+                    scores, representations, modality.modality_id, task.model.name
+                )
 
 
 class UnimodalResults:
-    def __init__(self, modality_id, task_name):
-        self.modality_id = modality_id
-        self.task_name = task_name
-        self.results = []
+    def __init__(self, modalities, tasks):
+        self.modality_ids = [modality.modality_id for modality in modalities]
+        self.task_names = [task.model.name for task in tasks]
+        self.results = {}
+
+        for modality in self.modality_ids:
+            self.results[modality] = {}
+            for task_name in self.task_names:
+                self.results[modality][task_name] = []
 
-    def add_result(self, scores, representations):
+    def add_result(self, scores, representations, modality_id, task_name):
         parameters = []
         representation_names = []
 
@@ -152,7 +157,13 @@ def add_result(self, scores, representations):
             train_score=scores[0],
             val_score=scores[1],
         )
-        self.results.append(entry)
+        self.results[modality_id][task_name].append(entry)
+
+    def print_results(self):
+        for modality in self.modality_ids:
+            for task_name in self.task_names:
+                for entry in self.results[modality][task_name]:
+                    print(f"{modality}_{task_name}: {entry}")
 
 
 @dataclass

From fb553d4d7fbf8d0af2e51b821efa04c4680cd71c Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Wed, 30 Jul 2025 09:35:42 +0200
Subject: [PATCH 05/19] add deterministic random seed generator

---
 .../scuro/drsearch/unimodal_optimizer.py      | 71 +++++++++++++------
 .../systemds/scuro/modality/modality.py       | 14 ++++
 .../systemds/scuro/modality/transformed.py    |  3 +-
 .../scuro/modality/unimodal_modality.py       |  2 +-
 .../scuro/representations/aggregate.py        |  1 +
 .../systemds/scuro/utils/static_variables.py  |  7 ++
 6 files changed, 76 insertions(+), 22 deletions(-)
 create mode 100644 src/main/python/systemds/scuro/utils/static_variables.py

diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 3f6671595e6..4d595fac10c 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -31,19 +31,20 @@
 
 
 class UnimodalOptimizer:
-    def __init__(self, modalities, tasks):
+    def __init__(self, modalities, tasks, debug=True):
         self.modalities = modalities
         self.tasks = tasks
 
         self.operator_registry = Registry()
-        self.operator_performance = UnimodalResults(modalities, tasks)
+        self.operator_performance = UnimodalResults(modalities, tasks, debug)
 
-        # for modality in self.modalities:
-        #     self.operator_performance[modality.modality_id] = {}
-        #     for task in tasks:
-        #         self.operator_performance[modality.modality_id][task.model.name] = (
-        #             UnimodalResults(modality.modality_id, task.name)
-        #         )
+        self._tasks_require_same_dims = True
+        self.expected_dimensions = None
+
+        for i in range(1, len(self.tasks)):
+            self.expected_dimensions = tasks[i].expected_dim
+            if tasks[i - 1].expected_dim != tasks[i].expected_dim:
+                self._tasks_require_same_dims = False
 
     def get_k_best_results(self, modality, k, task):
         """
@@ -53,7 +54,7 @@ def get_k_best_results(self, modality, k, task):
         """
 
         results = sorted(
-            self.operator_performance[modality.modality_id][task.model.name].results,
+            self.operator_performance.results[modality.modality_id][task.model.name],
             key=lambda x: x.val_score,
             reverse=True,
         )[:k]
@@ -96,34 +97,61 @@ def optimize(self):
                         self.evaluate(mod, [mod_op, con_op_after])
 
     def evaluate(self, modality, representations):
-        for task in self.tasks:
-            if task.expected_dim == 1 and get_shape(modality.metadata) > 1:
+        if self._tasks_require_same_dims:
+            if self.expected_dimensions == 1 and get_shape(modality.metadata) > 1:
                 for aggregation in Aggregation().get_aggregation_functions():
-                    # padding should not be necessary here
                     agg_operator = AggregatedRepresentation(
                         Aggregation(aggregation, False)
                     )
                     agg_modality = agg_operator.transform(modality)
 
-                    scores = task.run(agg_modality.data)
                     reps = representations.copy()
                     reps.append(agg_operator)
+                    agg_modality.pad()
+                    for task in self.tasks:
+                        scores = task.run(agg_modality.data)
 
+                        self.operator_performance.add_result(
+                            scores, reps, modality.modality_id, task.model.name
+                        )
+            else:
+                modality.pad()
+                for task in self.tasks:
+                    scores = task.run(modality.data)
                     self.operator_performance.add_result(
-                        scores, reps, modality.modality_id, task.model.name
+                        scores, representations, modality.modality_id, task.model.name
+                    )
+        else:
+            for task in self.tasks:
+                if task.expected_dim == 1 and get_shape(modality.metadata) > 1:
+                    for aggregation in Aggregation().get_aggregation_functions():
+                        agg_operator = AggregatedRepresentation(
+                            Aggregation(aggregation, False)
+                        )
+                        agg_modality = agg_operator.transform(modality)
+
+                        reps = representations.copy()
+                        reps.append(agg_operator)
+                        modality.pad()
+                        scores = task.run(agg_modality.data)
+
+                        self.operator_performance.add_result(
+                            scores, reps, modality.modality_id, task.model.name
+                        )
+                else:
+                    modality.pad()
+                    scores = task.run(modality.data)
+                    self.operator_performance.add_result(
+                        scores, representations, modality.modality_id, task.model.name
                     )
-            else:
-                scores = task.run(modality.data)
-                self.operator_performance.add_result(
-                    scores, representations, modality.modality_id, task.model.name
-                )
 
 
 class UnimodalResults:
-    def __init__(self, modalities, tasks):
+    def __init__(self, modalities, tasks, debug=False):
         self.modality_ids = [modality.modality_id for modality in modalities]
         self.task_names = [task.model.name for task in tasks]
         self.results = {}
+        self.debug = debug
 
         for modality in self.modality_ids:
             self.results[modality] = {}
@@ -159,6 +187,9 @@ def add_result(self, scores, representations, modality_id, task_name):
         )
         self.results[modality_id][task_name].append(entry)
 
+        if self.debug:
+            print(f"{modality_id}_{task_name}: {entry}")
+
     def print_results(self):
         for modality in self.modality_ids:
             for task_name in self.task_names:
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 87d5b5ee4e4..3b1076b3252 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -120,6 +120,20 @@ def flatten(self, padding=True):
         self.data = np.array(self.data)
         return self
 
+    def pad(self, value=0):
+        try:
+            result = np.array(self.data)
+        except:
+            maxlen = max([len(seq) for seq in self.data])
+
+            result = np.full((len(self.data), maxlen), value, dtype=self.data_type)
+
+            for i, seq in enumerate(self.data):
+                data = seq[:maxlen]
+                result[i, : len(data)] = data
+
+        self.data = result
+
     def get_data_layout(self):
         if self.has_metadata():
             return list(self.metadata.values())[0]["data_layout"]["representation"]
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index 362764d21e9..1a292b495b0 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -20,6 +20,7 @@
 # -------------------------------------------------------------
 from functools import reduce
 from operator import or_
+from typing import Union, List
 
 from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.modality.joined import JoinedModality
@@ -87,7 +88,7 @@ def apply_representation(self, representation):
         new_modality.update_metadata()
         return new_modality
 
-    def combine(self, other, fusion_method):
+    def combine(self, other: Union[Modality, List[Modality]], fusion_method):
         """
         Combines two or more modalities with each other using a dedicated fusion method
         :param other: The modality to be combined
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index c0ee70557c5..fb117aa32e8 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -117,7 +117,7 @@ def apply_representation(self, representation):
                 new_modality.data.extend(transformed_chunk.data)
                 new_modality.metadata.update(transformed_chunk.metadata)
         else:
-            if not self.data:
+            if not self.has_data():
                 self.extract_raw_data()
             new_modality = representation.transform(self)
 
diff --git a/src/main/python/systemds/scuro/representations/aggregate.py b/src/main/python/systemds/scuro/representations/aggregate.py
index 756e6271ea5..506d16f8d08 100644
--- a/src/main/python/systemds/scuro/representations/aggregate.py
+++ b/src/main/python/systemds/scuro/representations/aggregate.py
@@ -58,6 +58,7 @@ def __init__(self, aggregation_function="mean", pad_modality=False, params=None)
         self._aggregation_func = self._aggregation_function[aggregation_function]
         self.name = "Aggregation"
         self.pad_modality = pad_modality
+        self.aggregation_function_name = aggregation_function
 
         self.parameters = {
             "aggregation_function": aggregation_function,
diff --git a/src/main/python/systemds/scuro/utils/static_variables.py b/src/main/python/systemds/scuro/utils/static_variables.py
new file mode 100644
index 00000000000..b1733387160
--- /dev/null
+++ b/src/main/python/systemds/scuro/utils/static_variables.py
@@ -0,0 +1,7 @@
+import numpy as np
+
+global_rng = np.random.default_rng(42)
+
+
+def get_seed():
+    return global_rng.integers(0, 1024)

From 098eb2857c34d688a861f8695d5674b85914ac25 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Wed, 30 Jul 2025 09:46:37 +0200
Subject: [PATCH 06/19] aggregation function as hyperparameter

---
 .../scuro/drsearch/unimodal_optimizer.py      | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 4d595fac10c..8f450005b9c 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -99,21 +99,21 @@ def optimize(self):
     def evaluate(self, modality, representations):
         if self._tasks_require_same_dims:
             if self.expected_dimensions == 1 and get_shape(modality.metadata) > 1:
-                for aggregation in Aggregation().get_aggregation_functions():
-                    agg_operator = AggregatedRepresentation(
-                        Aggregation(aggregation, False)
-                    )
-                    agg_modality = agg_operator.transform(modality)
+                # for aggregation in Aggregation().get_aggregation_functions():
+                agg_operator = AggregatedRepresentation(
+                    Aggregation()
+                )
+                agg_modality = agg_operator.transform(modality)
 
-                    reps = representations.copy()
-                    reps.append(agg_operator)
-                    agg_modality.pad()
-                    for task in self.tasks:
-                        scores = task.run(agg_modality.data)
+                reps = representations.copy()
+                reps.append(agg_operator)
+                agg_modality.pad()
+                for task in self.tasks:
+                    scores = task.run(agg_modality.data)
 
-                        self.operator_performance.add_result(
-                            scores, reps, modality.modality_id, task.model.name
-                        )
+                    self.operator_performance.add_result(
+                        scores, reps, modality.modality_id, task.model.name
+                    )
             else:
                 modality.pad()
                 for task in self.tasks:
@@ -124,20 +124,20 @@ def evaluate(self, modality, representations):
         else:
             for task in self.tasks:
                 if task.expected_dim == 1 and get_shape(modality.metadata) > 1:
-                    for aggregation in Aggregation().get_aggregation_functions():
-                        agg_operator = AggregatedRepresentation(
-                            Aggregation(aggregation, False)
-                        )
-                        agg_modality = agg_operator.transform(modality)
+                    # for aggregation in Aggregation().get_aggregation_functions():
+                    agg_operator = AggregatedRepresentation(
+                        Aggregation()
+                    )
+                    agg_modality = agg_operator.transform(modality)
 
-                        reps = representations.copy()
-                        reps.append(agg_operator)
-                        modality.pad()
-                        scores = task.run(agg_modality.data)
+                    reps = representations.copy()
+                    reps.append(agg_operator)
+                    modality.pad()
+                    scores = task.run(agg_modality.data)
 
-                        self.operator_performance.add_result(
-                            scores, reps, modality.modality_id, task.model.name
-                        )
+                    self.operator_performance.add_result(
+                        scores, reps, modality.modality_id, task.model.name
+                    )
                 else:
                     modality.pad()
                     scores = task.run(modality.data)

From 28a001127c327422b28d83de316611c8dbd6d624 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Wed, 30 Jul 2025 14:55:19 +0200
Subject: [PATCH 07/19] parallelize unimodal optimization

---
 .../scuro/drsearch/multimodal_optimizer.py    |  11 ++
 .../scuro/drsearch/unimodal_optimizer.py      | 127 ++++++++++++------
 .../systemds/scuro/modality/modality.py       |  12 ++
 .../python/systemds/scuro/modality/type.py    |  16 ++-
 .../scuro/representations/aggregate.py        |   4 +-
 .../scuro/representations/representation.py   |   2 +-
 .../representations/window_aggregation.py     |   2 +-
 7 files changed, 121 insertions(+), 53 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index 74901f6204b..8dd2273caca 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -20,6 +20,7 @@ def __init__(self, modalities, unimodal_optimizer, tasks, k=2):
         self.extract_k_best_modalities_per_task()
         self.operator_registry = Registry()
         self.optimization_results = {}
+        self.cache = {}
 
         for modality in self.modalities:
             self.optimization_results[modality.modality_id] = {}
@@ -38,6 +39,13 @@ def optimize(self):
                 combined_representations = []
                 for i in range(1, len(applied_representations)):
                     for fusion_method in self.operator_registry.get_fusion_operators():
+                        if (
+                            fusion_method().needs_alignment
+                            and not applied_representations[i - 1].is_aligned(
+                                applied_representations[i]
+                            )
+                        ):
+                            continue
                         combined = applied_representations[i - 1].combine(
                             applied_representations[i], fusion_method()
                         )
@@ -107,6 +115,9 @@ def evaluate(self, task, modality, representations, fusion, modality_ids):
                 scores, representations, fusion, modality_ids, task
             )
 
+    def add_to_cache(self, result_idx, combined_modality):
+        self.cache[result_idx] = combined_modality
+
     def extract_k_best_modalities_per_task(self):
         self.k_best_modalities = {}
         for task in self.tasks:
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 8f450005b9c..589b1932913 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -18,8 +18,11 @@
 # under the License.
 #
 # -------------------------------------------------------------
+import pickle
+from concurrent.futures import ProcessPoolExecutor, as_completed
 from dataclasses import dataclass
 
+import multiprocessing as mp
 from systemds.scuro.representations.window_aggregation import WindowAggregation
 
 from build.lib.systemds.scuro.representations.aggregated_representation import (
@@ -39,9 +42,9 @@ def __init__(self, modalities, tasks, debug=True):
         self.operator_performance = UnimodalResults(modalities, tasks, debug)
 
         self._tasks_require_same_dims = True
-        self.expected_dimensions = None
+        self.expected_dimensions = tasks[0].expected_dim
 
-        for i in range(1, len(self.tasks)):
+        for i in range(1, len(tasks)):
             self.expected_dimensions = tasks[i].expected_dim
             if tasks[i - 1].expected_dim != tasks[i].expected_dim:
                 self._tasks_require_same_dims = False
@@ -61,73 +64,111 @@ def get_k_best_results(self, modality, k, task):
 
         return results
 
-    def optimize(self):
+    def optimize_parallel(self, n_workers=None):
+        if n_workers is None:
+            n_workers = min(len(self.modalities), mp.cpu_count())
+
+        with ProcessPoolExecutor(max_workers=n_workers) as executor:
+            future_to_modality = {
+                executor.submit(self._process_modality, modality): modality
+                for modality in self.modalities
+            }
+
+            for future in as_completed(future_to_modality):
+                modality = future_to_modality[future]
+                # try:
+                results = future.result()
+                self._merge_results(results)
+                # except Exception as exc:
+                #     print(f'Modality {modality.modality_id} generated an exception: {exc}')
+
+    def optimize(self, n_workers=None):
         for modality in self.modalities:
-            context_operators = self.operator_registry.get_context_operators()
+            self._process_modality(modality)
 
-            for context_operator in context_operators:
-                context_representation = None
-                if modality.modality_type != ModalityType.TEXT:
-                    con_op = context_operator()
-                    context_representation = modality.context(con_op)
-                    self.evaluate(context_representation, [con_op])
-
-                modality_specific_operators = (
-                    self.operator_registry.get_representations(modality.modality_type)
-                )
-                for modality_specific_operator in modality_specific_operators:
-                    mod_context = None
-                    mod_op = modality_specific_operator()
-                    if context_representation is not None:
-                        mod_context = context_representation.apply_representation(
-                            mod_op
+    def _process_modality(self, modality):
+        local_results = UnimodalResults(
+            modalities=[modality], tasks=self.tasks, debug=False
+        )
+        context_operators = self.operator_registry.get_context_operators()
+
+        for context_operator in context_operators:
+            context_representation = None
+            if modality.modality_type != ModalityType.TEXT:
+                con_op = context_operator()
+                print("context_operator ", con_op.name)
+                context_representation = modality.context(con_op)
+                self._evaluate_local(context_representation, [con_op], local_results)
+
+            modality_specific_operators = self.operator_registry.get_representations(
+                modality.modality_type
+            )
+            for modality_specific_operator in modality_specific_operators:
+                mod_context = None
+                mod_op = modality_specific_operator()
+                if context_representation is not None:
+                    print("before context" + mod_op.name)
+                    mod_context = context_representation.apply_representation(mod_op)
+                    print("after context" + mod_op.name)
+                    self._evaluate_local(mod_context, [con_op, mod_op], local_results)
+
+                print("before " + mod_op.name)
+                mod = modality.apply_representation(mod_op)
+                print("after " + mod_op.name)
+                self._evaluate_local(mod, [mod_op], local_results)
+
+                for context_operator_after in context_operators:
+                    con_op_after = context_operator_after()
+                    if mod_context is not None:
+                        mod_context = mod_context.context(con_op_after)
+                        self._evaluate_local(
+                            mod_context, [con_op, mod_op, con_op_after], local_results
                         )
-                        self.evaluate(mod_context, [con_op, mod_op])
 
-                    mod = modality.apply_representation(mod_op)
-                    self.evaluate(mod, [mod_op])
+                    mod = mod.context(con_op_after)
+                    self._evaluate_local(mod, [mod_op, con_op_after], local_results)
 
-                    for context_operator_after in context_operators:
-                        con_op_after = context_operator_after()
-                        if mod_context is not None:
-                            mod_context = mod_context.context(con_op_after)
-                            self.evaluate(mod_context, [con_op, mod_op, con_op_after])
+            return local_results
 
-                        mod = mod.context(con_op_after)
-                        self.evaluate(mod, [mod_op, con_op_after])
+    def _merge_results(self, local_results):
+        """Merge local results into the main results"""
+        for modality_id in local_results.results:
+            for task_name in local_results.results[modality_id]:
+                self.operator_performance.results[modality_id][task_name].extend(
+                    local_results.results[modality_id][task_name]
+                )
 
-    def evaluate(self, modality, representations):
+    def _evaluate_local(self, modality, representations, local_results):
         if self._tasks_require_same_dims:
             if self.expected_dimensions == 1 and get_shape(modality.metadata) > 1:
+                print("aggregate")
                 # for aggregation in Aggregation().get_aggregation_functions():
-                agg_operator = AggregatedRepresentation(
-                    Aggregation()
-                )
+                agg_operator = AggregatedRepresentation(Aggregation())
                 agg_modality = agg_operator.transform(modality)
-
+                print("aggregated")
                 reps = representations.copy()
                 reps.append(agg_operator)
                 agg_modality.pad()
                 for task in self.tasks:
                     scores = task.run(agg_modality.data)
 
-                    self.operator_performance.add_result(
+                    local_results.add_result(
                         scores, reps, modality.modality_id, task.model.name
                     )
             else:
+                print("padd")
                 modality.pad()
+                print("done pad")
                 for task in self.tasks:
                     scores = task.run(modality.data)
-                    self.operator_performance.add_result(
+                    local_results.add_result(
                         scores, representations, modality.modality_id, task.model.name
                     )
         else:
             for task in self.tasks:
                 if task.expected_dim == 1 and get_shape(modality.metadata) > 1:
                     # for aggregation in Aggregation().get_aggregation_functions():
-                    agg_operator = AggregatedRepresentation(
-                        Aggregation()
-                    )
+                    agg_operator = AggregatedRepresentation(Aggregation())
                     agg_modality = agg_operator.transform(modality)
 
                     reps = representations.copy()
@@ -135,13 +176,13 @@ def evaluate(self, modality, representations):
                     modality.pad()
                     scores = task.run(agg_modality.data)
 
-                    self.operator_performance.add_result(
+                    local_results.add_result(
                         scores, reps, modality.modality_id, task.model.name
                     )
                 else:
                     modality.pad()
                     scores = task.run(modality.data)
-                    self.operator_performance.add_result(
+                    local_results.add_result(
                         scores, representations, modality.modality_id, task.model.name
                     )
 
@@ -169,7 +210,7 @@ def add_result(self, scores, representations, modality_id, task_name):
                 continue
 
             params = {}
-            for param in rep.parameters.keys():
+            for param in list(rep.parameters.keys()):
                 params[param] = getattr(rep, param)
 
             if isinstance(rep, WindowAggregation):
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 3b1076b3252..32e68eff086 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -145,3 +145,15 @@ def has_data(self):
 
     def has_metadata(self):
         return self.metadata is not None and self.metadata != {}
+
+    def is_aligned(self, other_modality):
+        aligned = True
+        for i in range(len(self.data)):
+            if (
+                list(self.metadata.values())[i]["data_layout"]["shape"]
+                != list(other_modality.metadata.values())[i]["data_layout"]["shape"]
+            ):
+                aligned = False
+                continue
+
+        return aligned
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index a479e07085d..c0baaf4c6aa 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -99,11 +99,15 @@ def update_base_metadata(cls, md, data, data_is_single_instance=True):
         dtype = np.nan
         shape = None
         if data_layout is DataLayout.SINGLE_LEVEL:
-            dtype = data.dtype
-            shape = data.shape
-        elif data_layout is DataLayout.NESTED_LEVEL:
-            shape = data[0].shape
             dtype = data[0].dtype
+            shape = data[0].shape
+        elif data_layout is DataLayout.NESTED_LEVEL:
+            if data_is_single_instance:
+                dtype = data.dtype
+                shape = data.shape
+            else:
+                shape = data[0].shape
+                dtype = data[0].dtype
 
         md["data_layout"].update(
             {"representation": data_layout, "type": dtype, "shape": shape}
@@ -241,9 +245,9 @@ def get_data_layout(cls, data, data_is_single_instance):
 
         if data_is_single_instance:
             if isinstance(data, list):
-                return DataLayout.NESTED_LEVEL
-            elif isinstance(data, np.ndarray):
                 return DataLayout.SINGLE_LEVEL
+            elif isinstance(data, np.ndarray):
+                return DataLayout.NESTED_LEVEL
 
         if isinstance(data[0], list):
             return DataLayout.NESTED_LEVEL
diff --git a/src/main/python/systemds/scuro/representations/aggregate.py b/src/main/python/systemds/scuro/representations/aggregate.py
index 506d16f8d08..1e73c81696d 100644
--- a/src/main/python/systemds/scuro/representations/aggregate.py
+++ b/src/main/python/systemds/scuro/representations/aggregate.py
@@ -52,7 +52,7 @@ def __init__(self, aggregation_function="mean", pad_modality=False, params=None)
             aggregation_function = params["aggregation_function"]
             pad_modality = params["pad_modality"]
 
-        if aggregation_function not in self._aggregation_function.keys():
+        if aggregation_function not in list(self._aggregation_function.keys()):
             raise ValueError("Invalid aggregation function")
 
         self._aggregation_func = self._aggregation_function[aggregation_function]
@@ -101,4 +101,4 @@ def aggregate_instance(self, instance):
         return self._aggregation_func(instance)
 
     def get_aggregation_functions(self):
-        return self._aggregation_function.keys()
+        return list(self._aggregation_function.keys())
diff --git a/src/main/python/systemds/scuro/representations/representation.py b/src/main/python/systemds/scuro/representations/representation.py
index a9f283b6fe3..6137baf46dc 100644
--- a/src/main/python/systemds/scuro/representations/representation.py
+++ b/src/main/python/systemds/scuro/representations/representation.py
@@ -32,7 +32,7 @@ def parameters(self):
 
     def get_current_parameters(self):
         current_params = {}
-        for parameter in self.parameters.keys():
+        for parameter in list(self.parameters.keys()):
             current_params[parameter] = getattr(self, parameter)
         return current_params
 
diff --git a/src/main/python/systemds/scuro/representations/window_aggregation.py b/src/main/python/systemds/scuro/representations/window_aggregation.py
index bff63729c7b..773399eecdf 100644
--- a/src/main/python/systemds/scuro/representations/window_aggregation.py
+++ b/src/main/python/systemds/scuro/representations/window_aggregation.py
@@ -62,7 +62,7 @@ def execute(self, modality):
 
             windowed_data.append(windowed_instance)
 
-        return windowed_data
+        return np.array(windowed_data)
 
     def window_aggregate_single_level(self, instance, new_length):
         if isinstance(instance, str):

From f2760f13968455e95941d4d68ad583f8bf518f89 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Wed, 30 Jul 2025 15:55:17 +0200
Subject: [PATCH 08/19] print multimodal optimizations

---
 .../scuro/drsearch/multimodal_optimizer.py    | 108 +++++++++++++-----
 .../scuro/drsearch/unimodal_optimizer.py      |   9 --
 2 files changed, 79 insertions(+), 38 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index 8dd2273caca..4a5a2ec79e5 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -11,7 +11,7 @@
 
 
 class MultimodalOptimizer:
-    def __init__(self, modalities, unimodal_optimizer, tasks, k=2):
+    def __init__(self, modalities, unimodal_optimizer, tasks, k=2, debug=True):
         self.k_best_modalities = None
         self.modalities = modalities
         self.unimodal_optimizer = unimodal_optimizer
@@ -19,20 +19,17 @@ def __init__(self, modalities, unimodal_optimizer, tasks, k=2):
         self.k = k
         self.extract_k_best_modalities_per_task()
         self.operator_registry = Registry()
-        self.optimization_results = {}
+        self.optimization_results = MultimodalResults(
+            modalities, tasks, debug, self.k_best_modalities
+        )
         self.cache = {}
 
-        for modality in self.modalities:
-            self.optimization_results[modality.modality_id] = {}
-            for task in tasks:
-                self.optimization_results[modality.modality_id][task.model.name] = (
-                    MultimodalResults(modality, task.name)
-                )
-
     def optimize(self):
         for task in self.tasks:
             for modality in self.modalities:
-                representations = self.k_best_modalities[task][modality.modality_id]
+                representations = self.k_best_modalities[task.model.name][
+                    modality.modality_id
+                ]
                 applied_representations = self.extract_representations(
                     representations, modality
                 )
@@ -54,7 +51,10 @@ def optimize(self):
                             combined,
                             [i - 1, i],
                             fusion_method,
-                            [modality.modality_id],
+                            [
+                                applied_representations[i - 1].modality_id,
+                                applied_representations[i].modality_id,
+                            ],
                         )
                         if not fusion_method().commutative:
                             combined_comm = applied_representations[i].combine(
@@ -65,7 +65,10 @@ def optimize(self):
                                 combined_comm,
                                 [i, i - 1],
                                 fusion_method,
-                                [modality.modality_id],
+                                [
+                                    applied_representations[i - 1].modality_id,
+                                    applied_representations[i].modality_id,
+                                ],
                             )
 
     def extract_representations(self, representations, modality):
@@ -106,13 +109,13 @@ def evaluate(self, task, modality, representations, fusion, modality_ids):
                 reps = representations.copy()
                 reps.append(agg_operator)
 
-                self.optimization_results[modality.modality_id][
-                    task.model.name
-                ].add_result(scores, reps, fusion, modality_ids, task)
+                self.optimization_results.add_result(
+                    scores, reps, [fusion], modality_ids, task.model.name
+                )
         else:
             scores = task.run(modality.data)
-            self.optimization_results[modality.modality_id][task.model.name].add_result(
-                scores, representations, fusion, modality_ids, task
+            self.optimization_results.add_result(
+                scores, representations, [fusion], modality_ids, task.model.name
             )
 
     def add_to_cache(self, result_idx, combined_modality):
@@ -121,33 +124,80 @@ def add_to_cache(self, result_idx, combined_modality):
     def extract_k_best_modalities_per_task(self):
         self.k_best_modalities = {}
         for task in self.tasks:
-            self.k_best_modalities[task] = {}
+            self.k_best_modalities[task.model.name] = {}
             for modality in self.modalities:
-                self.k_best_modalities[task][modality.modality_id] = (
+                self.k_best_modalities[task.model.name][modality.modality_id] = (
                     self.unimodal_optimizer.get_k_best_results(modality, self.k, task)
                 )
 
 
 class MultimodalResults:
-    def __init__(self, modality, task):
-        self.modality_id = modality.modality_id
-        self.task = task
-
-        self.results = []
+    def __init__(self, modalities, tasks, debug, k_best_modalities):
+        self.modality_ids = [modality.modality_id for modality in modalities]
+        self.task_names = [task.model.name for task in tasks]
+        self.results = {}
+        self.debug = debug
+        self.k_best_modalities = k_best_modalities
 
     def add_result(
-        self, scores, best_representation_idx, fusion_method, modality_ids, task
+        self, scores, best_representation_idx, fusion_methods, modality_ids, task_name
     ):
 
         entry = MultimodalResultEntry(
             representations=best_representation_idx,
             train_score=scores[0],
             val_score=scores[1],
-            fusion_method=fusion_method.__name__,
+            fusion_methods=[fusion_method.__name__ for fusion_method in fusion_methods],
             modality_ids=modality_ids,
-            task=task,
+            task=task_name,
         )
-        self.results.append(entry)
+
+        modality_id_strings = "_".join(list(map(str, modality_ids)))
+        if not modality_id_strings in self.results:
+            self.results[modality_id_strings] = {}
+            self.results[modality_id_strings][task_name] = []
+
+        self.results[modality_id_strings][task_name].append(entry)
+
+    def print_results(self):
+        for modality in self.results.keys():
+            for task_name in self.task_names:
+                for entry in self.results[modality][task_name]:
+                    reps = []
+                    for i, mod_idx in enumerate(entry.modality_ids):
+                        reps.append(
+                            self.k_best_modalities[task_name][mod_idx][
+                                entry.representations[i]
+                            ]
+                        )
+
+                    print(
+                        f"{modality}_{task_name}: "
+                        f"Validation score: {entry.val_score} - Training score: {entry.train_score}"
+                    )
+                    for i, rep in enumerate(reps):
+                        print(
+                            f"    Representation: {entry.modality_ids[i]} - {rep.representations}"
+                        )
+                        if i < len(reps) - 1:
+                            print(f"    Fusion: {entry.fusion_methods[i]} ")
+
+    def store_results(self):
+        for modality in self.results.keys():
+            for task_name in self.task_names:
+                for entry in self.results[modality][task_name]:
+                    reps = []
+                    for i, mod_idx in enumerate(entry.modality_ids):
+                        reps.append(
+                            self.k_best_modalities[task_name][mod_idx][
+                                entry.representations[i]
+                            ]
+                        )
+                    entry.representations = reps
+
+        import pickle
+
+        pickle.dump(self.results, open("multimodal_results.p", "wb"))
 
 
 @dataclasses.dataclass
@@ -155,6 +205,6 @@ class MultimodalResultEntry:
     val_score: float
     modality_ids: list
     representations: list
-    fusion_method: str
+    fusion_methods: list
     train_score: float
     task: str
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 589b1932913..e27b805ae54 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -96,7 +96,6 @@ def _process_modality(self, modality):
             context_representation = None
             if modality.modality_type != ModalityType.TEXT:
                 con_op = context_operator()
-                print("context_operator ", con_op.name)
                 context_representation = modality.context(con_op)
                 self._evaluate_local(context_representation, [con_op], local_results)
 
@@ -107,14 +106,10 @@ def _process_modality(self, modality):
                 mod_context = None
                 mod_op = modality_specific_operator()
                 if context_representation is not None:
-                    print("before context" + mod_op.name)
                     mod_context = context_representation.apply_representation(mod_op)
-                    print("after context" + mod_op.name)
                     self._evaluate_local(mod_context, [con_op, mod_op], local_results)
 
-                print("before " + mod_op.name)
                 mod = modality.apply_representation(mod_op)
-                print("after " + mod_op.name)
                 self._evaluate_local(mod, [mod_op], local_results)
 
                 for context_operator_after in context_operators:
@@ -141,11 +136,9 @@ def _merge_results(self, local_results):
     def _evaluate_local(self, modality, representations, local_results):
         if self._tasks_require_same_dims:
             if self.expected_dimensions == 1 and get_shape(modality.metadata) > 1:
-                print("aggregate")
                 # for aggregation in Aggregation().get_aggregation_functions():
                 agg_operator = AggregatedRepresentation(Aggregation())
                 agg_modality = agg_operator.transform(modality)
-                print("aggregated")
                 reps = representations.copy()
                 reps.append(agg_operator)
                 agg_modality.pad()
@@ -156,9 +149,7 @@ def _evaluate_local(self, modality, representations, local_results):
                         scores, reps, modality.modality_id, task.model.name
                     )
             else:
-                print("padd")
                 modality.pad()
-                print("done pad")
                 for task in self.tasks:
                     scores = task.run(modality.data)
                     local_results.add_result(

From 0a4f6cade8e8b73ff3fb96aa8f0a1627beccfe24 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Tue, 5 Aug 2025 14:50:02 +0200
Subject: [PATCH 09/19] refine unimodal optimizer

---
 .../scuro/drsearch/multimodal_optimizer.py    | 107 ++++++++++--------
 .../scuro/drsearch/unimodal_optimizer.py      |  20 +++-
 .../representations/window_aggregation.py     |   4 +-
 3 files changed, 79 insertions(+), 52 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index 4a5a2ec79e5..e6d7abd25c3 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -26,50 +26,54 @@ def __init__(self, modalities, unimodal_optimizer, tasks, k=2, debug=True):
 
     def optimize(self):
         for task in self.tasks:
-            for modality in self.modalities:
-                representations = self.k_best_modalities[task.model.name][
-                    modality.modality_id
-                ]
-                applied_representations = self.extract_representations(
-                    representations, modality
-                )
-                combined_representations = []
-                for i in range(1, len(applied_representations)):
-                    for fusion_method in self.operator_registry.get_fusion_operators():
-                        if (
-                            fusion_method().needs_alignment
-                            and not applied_representations[i - 1].is_aligned(
-                                applied_representations[i]
-                            )
-                        ):
-                            continue
-                        combined = applied_representations[i - 1].combine(
-                            applied_representations[i], fusion_method()
+            self.optimize_intramodal_representations(task)
+            self.optimize_intermodal_representations(task)
+
+    def optimize_intramodal_representations(self, task):
+        for modality in self.modalities:
+            representations = self.k_best_modalities[task.model.name][
+                modality.modality_id
+            ]
+            applied_representations = self.extract_representations(
+                representations, modality
+            )
+
+            for i in range(1, len(applied_representations)):
+                for fusion_method in self.operator_registry.get_fusion_operators():
+                    if fusion_method().needs_alignment and not applied_representations[
+                        i - 1
+                    ].is_aligned(applied_representations[i]):
+                        continue
+                    combined = applied_representations[i - 1].combine(
+                        applied_representations[i], fusion_method()
+                    )
+                    self.evaluate(
+                        task,
+                        combined,
+                        [i - 1, i],
+                        fusion_method,
+                        [
+                            applied_representations[i - 1].modality_id,
+                            applied_representations[i].modality_id,
+                        ],
+                    )
+                    if not fusion_method().commutative:
+                        combined_comm = applied_representations[i].combine(
+                            applied_representations[i - 1], fusion_method()
                         )
                         self.evaluate(
                             task,
-                            combined,
-                            [i - 1, i],
+                            combined_comm,
+                            [i, i - 1],
                             fusion_method,
                             [
                                 applied_representations[i - 1].modality_id,
                                 applied_representations[i].modality_id,
                             ],
                         )
-                        if not fusion_method().commutative:
-                            combined_comm = applied_representations[i].combine(
-                                applied_representations[i - 1], fusion_method()
-                            )
-                            self.evaluate(
-                                task,
-                                combined_comm,
-                                [i, i - 1],
-                                fusion_method,
-                                [
-                                    applied_representations[i - 1].modality_id,
-                                    applied_representations[i].modality_id,
-                                ],
-                            )
+
+    def optimize_intermodal_representations(self, task):
+        pass
 
     def extract_representations(self, representations, modality):
         applied_representations = []
@@ -139,6 +143,9 @@ def __init__(self, modalities, tasks, debug, k_best_modalities):
         self.debug = debug
         self.k_best_modalities = k_best_modalities
 
+        for task in tasks:
+            self.results[task.model.name] = {}
+
     def add_result(
         self, scores, best_representation_idx, fusion_methods, modality_ids, task_name
     ):
@@ -153,16 +160,15 @@ def add_result(
         )
 
         modality_id_strings = "_".join(list(map(str, modality_ids)))
-        if not modality_id_strings in self.results:
-            self.results[modality_id_strings] = {}
-            self.results[modality_id_strings][task_name] = []
+        if not modality_id_strings in self.results[task_name]:
+            self.results[task_name][modality_id_strings] = []
 
-        self.results[modality_id_strings][task_name].append(entry)
+        self.results[task_name][modality_id_strings].append(entry)
 
     def print_results(self):
-        for modality in self.results.keys():
-            for task_name in self.task_names:
-                for entry in self.results[modality][task_name]:
+        for task_name in self.task_names:
+            for modality in self.results[task_name].keys():
+                for entry in self.results[task_name][modality]:
                     reps = []
                     for i, mod_idx in enumerate(entry.modality_ids):
                         reps.append(
@@ -182,10 +188,10 @@ def print_results(self):
                         if i < len(reps) - 1:
                             print(f"    Fusion: {entry.fusion_methods[i]} ")
 
-    def store_results(self):
-        for modality in self.results.keys():
-            for task_name in self.task_names:
-                for entry in self.results[modality][task_name]:
+    def store_results(self, file_name=None):
+        for task_name in self.task_names:
+            for modality in self.results[task_name].keys():
+                for entry in self.results[task_name][modality]:
                     reps = []
                     for i, mod_idx in enumerate(entry.modality_ids):
                         reps.append(
@@ -197,7 +203,14 @@ def store_results(self):
 
         import pickle
 
-        pickle.dump(self.results, open("multimodal_results.p", "wb"))
+        if file_name is None:
+            import time
+
+            timestr = time.strftime("%Y%m%d-%H%M%S")
+            file_name = "multimodal_optimizer" + timestr + ".pkl"
+
+        with open(file_name, "wb") as f:
+            pickle.dump(self.results, f)
 
 
 @dataclasses.dataclass
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index e27b805ae54..54c0ba52aed 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -49,6 +49,16 @@ def __init__(self, modalities, tasks, debug=True):
             if tasks[i - 1].expected_dim != tasks[i].expected_dim:
                 self._tasks_require_same_dims = False
 
+    def store_results(self, file_name=None):
+        """Pickle operator_performance to file_name (timestamped default name)."""
+        if file_name is None:
+            import time
+
+            stamp = time.strftime("%Y%m%d-%H%M%S")
+            file_name = f"unimodal_optimizer{stamp}.pkl"
+        with open(file_name, "wb") as out_file:
+            pickle.dump(self.operator_performance, out_file)
+
     def get_k_best_results(self, modality, k, task):
         """
         Get the k best results for the given modality
@@ -82,9 +92,10 @@ def optimize_parallel(self, n_workers=None):
                 # except Exception as exc:
                 #     print(f'Modality {modality.modality_id} generated an exception: {exc}')
 
-    def optimize(self, n_workers=None):
+    def optimize(self):
         for modality in self.modalities:
-            self._process_modality(modality)
+            local_result = self._process_modality(modality)
+            self._merge_results(local_result)
 
     def _process_modality(self, modality):
         local_results = UnimodalResults(
@@ -94,7 +105,10 @@ def _process_modality(self, modality):
 
         for context_operator in context_operators:
             context_representation = None
-            if modality.modality_type != ModalityType.TEXT:
+            if (
+                modality.modality_type != ModalityType.TEXT
+                and modality.modality_type != ModalityType.VIDEO
+            ):
                 con_op = context_operator()
                 context_representation = modality.context(con_op)
                 self._evaluate_local(context_representation, [con_op], local_results)
diff --git a/src/main/python/systemds/scuro/representations/window_aggregation.py b/src/main/python/systemds/scuro/representations/window_aggregation.py
index 773399eecdf..d17c703721b 100644
--- a/src/main/python/systemds/scuro/representations/window_aggregation.py
+++ b/src/main/python/systemds/scuro/representations/window_aggregation.py
@@ -62,7 +62,7 @@ def execute(self, modality):
 
             windowed_data.append(windowed_instance)
 
-        return np.array(windowed_data)
+        return windowed_data
 
     def window_aggregate_single_level(self, instance, new_length):
         if isinstance(instance, str):
@@ -86,4 +86,4 @@ def window_aggregate_nested_level(self, instance, new_length):
                 data[i * self.window_size : i * self.window_size + self.window_size]
             )
 
-        return result
+        return np.array(result)

From 9abf34e166694eb9ab76293ae0e58eceb18f3efa Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Tue, 5 Aug 2025 16:23:53 +0200
Subject: [PATCH 10/19] add timing

---
 .../scuro/drsearch/unimodal_optimizer.py      | 46 ++++++++++++++++---
 .../systemds/scuro/modality/modality.py       |  1 +
 .../systemds/scuro/modality/transformed.py    |  9 +++-
 .../scuro/modality/unimodal_modality.py       |  7 ++-
 4 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 54c0ba52aed..731e42623cd 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -19,6 +19,7 @@
 #
 # -------------------------------------------------------------
 import pickle
+import time
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from dataclasses import dataclass
 
@@ -157,17 +158,31 @@ def _evaluate_local(self, modality, representations, local_results):
                 reps.append(agg_operator)
                 agg_modality.pad()
                 for task in self.tasks:
+                    start = time.time()
                     scores = task.run(agg_modality.data)
+                    end = time.time()
 
                     local_results.add_result(
-                        scores, reps, modality.modality_id, task.model.name
+                        scores,
+                        reps,
+                        modality.modality_id,
+                        task.model.name,
+                        modality.transform_time,
+                        end - start,
                     )
             else:
                 modality.pad()
                 for task in self.tasks:
+                    start = time.time()
                     scores = task.run(modality.data)
+                    end = time.time()
                     local_results.add_result(
-                        scores, representations, modality.modality_id, task.model.name
+                        scores,
+                        representations,
+                        modality.modality_id,
+                        task.model.name,
+                        modality.transform_time,
+                        end - start,
                     )
         else:
             for task in self.tasks:
@@ -179,16 +194,29 @@ def _evaluate_local(self, modality, representations, local_results):
                     reps = representations.copy()
                     reps.append(agg_operator)
                     modality.pad()
+                    start = time.time()
                     scores = task.run(agg_modality.data)
-
+                    end = time.time()
                     local_results.add_result(
-                        scores, reps, modality.modality_id, task.model.name
+                        scores,
+                        reps,
+                        modality.modality_id,
+                        task.model.name,
+                        modality.transform_time,
+                        end - start,
                     )
                 else:
                     modality.pad()
+                    start = time.time()
                     scores = task.run(modality.data)
+                    end = time.time()
                     local_results.add_result(
-                        scores, representations, modality.modality_id, task.model.name
+                        scores,
+                        representations,
+                        modality.modality_id,
+                        task.model.name,
+                        modality.transform_time,
+                        end - start,
                     )
 
 
@@ -204,7 +232,9 @@ def __init__(self, modalities, tasks, debug=False):
             for task_name in self.task_names:
                 self.results[modality][task_name] = []
 
-    def add_result(self, scores, representations, modality_id, task_name):
+    def add_result(
+        self, scores, representations, modality_id, task_name, rep_time, task_time
+    ):
         parameters = []
         representation_names = []
 
@@ -230,6 +260,8 @@ def add_result(self, scores, representations, modality_id, task_name):
             params=parameters,
             train_score=scores[0],
             val_score=scores[1],
+            representation_time=rep_time,
+            task_time=task_time,
         )
         self.results[modality_id][task_name].append(entry)
 
@@ -249,3 +281,5 @@ class ResultEntry:
     representations: list
     params: list
     train_score: float
+    representation_time: float
+    task_time: float
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 32e68eff086..272696940fe 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -44,6 +44,7 @@ def __init__(
         self.cost = None
         self.shape = None
         self.modality_id = modality_id
+        self.transform_time = None
 
     @property
     def data(self):
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index 1a292b495b0..63ca10251c4 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -26,6 +26,7 @@
 from systemds.scuro.modality.joined import JoinedModality
 from systemds.scuro.modality.modality import Modality
 from systemds.scuro.representations.window_aggregation import WindowAggregation
+import time
 
 
 class TransformedModality(Modality):
@@ -73,19 +74,23 @@ def join(self, right, join_condition):
     def window_aggregation(self, windowSize, aggregation):
         w = WindowAggregation(windowSize, aggregation)
         transformed_modality = TransformedModality(self, w)
+        start = time.time()
         transformed_modality.data = w.execute(self)
-
+        transformed_modality.transform_time = time.time() - start
         return transformed_modality
 
     def context(self, context_operator):
         transformed_modality = TransformedModality(self, context_operator)
-
+        start = time.time()
         transformed_modality.data = context_operator.execute(self)
+        transformed_modality.transform_time = time.time() - start
         return transformed_modality
 
     def apply_representation(self, representation):
+        start = time.time()
         new_modality = representation.transform(self)
         new_modality.update_metadata()
+        new_modality.transform_time = time.time() - start
         return new_modality
 
     def combine(self, other: Union[Modality, List[Modality]], fusion_method):
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index fb117aa32e8..8b318cf3a7f 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -20,7 +20,7 @@
 # -------------------------------------------------------------
 from functools import reduce
 from operator import or_
-
+import time
 
 from systemds.scuro.dataloader.base_loader import BaseLoader
 from systemds.scuro.modality.modality import Modality
@@ -86,12 +86,14 @@ def join(self, other, join_condition):
         return joined_modality
 
     def context(self, context_operator):
+        start = time.time()
         if not self.has_data():
             self.extract_raw_data()
 
         transformed_modality = TransformedModality(self, context_operator)
 
         transformed_modality.data = context_operator.execute(self)
+        transformed_modality.transform_time = time.time() - start
         return transformed_modality
 
     def aggregate(self, aggregation_function):
@@ -108,7 +110,7 @@ def apply_representation(self, representation):
             representation,
         )
         new_modality.data = []
-
+        start = time.time()
         if self.data_loader.chunk_size:
             self.data_loader.reset()
             while self.data_loader.next_chunk < self.data_loader.num_chunks:
@@ -122,4 +124,5 @@ def apply_representation(self, representation):
             new_modality = representation.transform(self)
 
         new_modality.update_metadata()
+        new_modality.transform_time = time.time() - start
         return new_modality

From 302501e9e91f8d7ab2027f5a9d5293d7f2addf74 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Wed, 6 Aug 2025 15:51:25 +0200
Subject: [PATCH 11/19] add padding

---
 .../systemds/scuro/modality/modality.py       |  4 ++-
 .../python/systemds/scuro/modality/type.py    |  9 ++++-
 .../scuro/modality/unimodal_modality.py       | 36 ++++++++++++++++++-
 3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 272696940fe..e4240690f18 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -62,9 +62,11 @@ def get_modality_names(self) -> List[str]:
         """
         Extracts the individual unimodal modalities for a given transformed modality.
         """
-        return [
+        modality_names = [
             modality.name for modality in ModalityType if modality in self.modality_type
         ]
+        modality_names.append(str(self.modality_id))
+        return modality_names
 
     def copy_from_instance(self):
         """
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index c0baaf4c6aa..e04ef685689 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -26,6 +26,7 @@
     calculate_new_frequency,
     create_timestamps,
 )
+import torch
 
 
 # TODO: needs a way to define if data comes from a dataset with multiple instances or is like a streaming scenario where we only have one instance
@@ -203,6 +204,12 @@ def add_field(self, md, field, data):
         md[field] = data
         return md
 
+    def add_field_for_instances(self, md, field, data):
+        for key, value in zip(md.keys(), data):
+            md[key].update({field: value})
+
+        return md
+
     def create_audio_metadata(self, sampling_rate, data):
         md = deepcopy(self.get_schema())
         md = ModalitySchemas.update_base_metadata(md, data, True)
@@ -246,7 +253,7 @@ def get_data_layout(cls, data, data_is_single_instance):
         if data_is_single_instance:
             if isinstance(data, list):
                 return DataLayout.SINGLE_LEVEL
-            elif isinstance(data, np.ndarray):
+            elif isinstance(data, np.ndarray) or isinstance(data, torch.Tensor):
                 return DataLayout.NESTED_LEVEL
 
         if isinstance(data[0], list):
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index 8b318cf3a7f..edcab8cec75 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -21,7 +21,8 @@
 from functools import reduce
 from operator import or_
 import time
-
+import numpy as np
+from systemds.scuro import ModalityType
 from systemds.scuro.dataloader.base_loader import BaseLoader
 from systemds.scuro.modality.modality import Modality
 from systemds.scuro.modality.joined import JoinedModality
@@ -113,11 +114,44 @@ def apply_representation(self, representation):
         start = time.time()
         if self.data_loader.chunk_size:
             self.data_loader.reset()
+            original_lengths = []
             while self.data_loader.next_chunk < self.data_loader.num_chunks:
                 self.extract_raw_data()
                 transformed_chunk = representation.transform(self)
                 new_modality.data.extend(transformed_chunk.data)
+                for d in transformed_chunk.data:
+                    original_lengths.append(d.shape[0])
                 new_modality.metadata.update(transformed_chunk.metadata)
+
+            target_length = max(original_lengths)
+            padded_embeddings = []
+            for embeddings in new_modality.data:
+                current_length = embeddings.shape[0]
+                if current_length < target_length:
+                    padding_needed = target_length - current_length
+
+                    padded = np.pad(
+                        embeddings,
+                        pad_width=(
+                            (0, padding_needed),
+                            (0, 0),
+                        ),  # (before, after) for each axis
+                        mode="constant",
+                        constant_values=0,
+                    )
+                    padded_embeddings.append(padded)
+                else:
+                    padded_embeddings.append(embeddings)
+
+            attention_masks = np.zeros((len(new_modality.data), target_length))
+            for i, length in enumerate(original_lengths):
+                attention_masks[i, :length] = 1
+
+            ModalityType(self.modality_type).add_field_for_instances(
+                new_modality.metadata, "attention_masks", attention_masks
+            )
+            new_modality.data = padded_embeddings
+
         else:
             if not self.has_data():
                 self.extract_raw_data()

From 6d31982caf10898373c6d2f4d6ac3f6584e91da8 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Thu, 7 Aug 2025 11:19:38 +0200
Subject: [PATCH 12/19] adapt padding to window operations

---
 .../systemds/scuro/drsearch/dr_search.py      |  2 +-
 .../scuro/drsearch/unimodal_optimizer.py      |  8 +--
 .../scuro/modality/joined_transformed.py      |  3 +-
 .../python/systemds/scuro/modality/type.py    | 18 +++++--
 .../scuro/modality/unimodal_modality.py       | 21 +++++---
 .../systemds/scuro/representations/bert.py    | 53 ++++++++++---------
 .../scuro/representations/mel_spectrogram.py  |  9 ++--
 .../systemds/scuro/representations/mfcc.py    |  9 ++--
 .../scuro/representations/spectrogram.py      |  9 ++--
 .../systemds/scuro/representations/tfidf.py   |  3 +-
 .../systemds/scuro/representations/wav2vec.py |  4 +-
 .../representations/window_aggregation.py     | 47 ++++++++++++++--
 .../scuro/representations/word2vec.py         |  4 +-
 .../systemds/scuro/utils/static_variables.py  |  9 ++++
 src/main/python/tests/scuro/data_generator.py | 37 +++++++------
 .../tests/scuro/test_multimodal_join.py       |  4 --
 .../tests/scuro/test_unimodal_optimizer.py    | 16 ++----
 17 files changed, 159 insertions(+), 97 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/dr_search.py b/src/main/python/systemds/scuro/drsearch/dr_search.py
index 2000608a1df..601001c7428 100644
--- a/src/main/python/systemds/scuro/drsearch/dr_search.py
+++ b/src/main/python/systemds/scuro/drsearch/dr_search.py
@@ -76,7 +76,7 @@ def set_best_params(
         """
 
         # check if modality name is already in dictionary
-        if "_".join(modality_names) not in self.scores.keys():
+        if "_".join(modality_names) not in list(self.scores.keys()):
             # if not add it to dictionary
             self.scores["_".join(modality_names)] = {}
 
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 731e42623cd..21dce69840c 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -156,7 +156,7 @@ def _evaluate_local(self, modality, representations, local_results):
                 agg_modality = agg_operator.transform(modality)
                 reps = representations.copy()
                 reps.append(agg_operator)
-                agg_modality.pad()
+                # agg_modality.pad()
                 for task in self.tasks:
                     start = time.time()
                     scores = task.run(agg_modality.data)
@@ -171,7 +171,7 @@ def _evaluate_local(self, modality, representations, local_results):
                         end - start,
                     )
             else:
-                modality.pad()
+                # modality.pad()
                 for task in self.tasks:
                     start = time.time()
                     scores = task.run(modality.data)
@@ -193,7 +193,7 @@ def _evaluate_local(self, modality, representations, local_results):
 
                     reps = representations.copy()
                     reps.append(agg_operator)
-                    modality.pad()
+                    # modality.pad()
                     start = time.time()
                     scores = task.run(agg_modality.data)
                     end = time.time()
@@ -206,7 +206,7 @@ def _evaluate_local(self, modality, representations, local_results):
                         end - start,
                     )
                 else:
-                    modality.pad()
+                    # modality.pad()
                     start = time.time()
                     scores = task.run(modality.data)
                     end = time.time()
diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py
index 6c6190e03cc..3e0d8fb9dfb 100644
--- a/src/main/python/systemds/scuro/modality/joined_transformed.py
+++ b/src/main/python/systemds/scuro/modality/joined_transformed.py
@@ -36,7 +36,8 @@ def __init__(self, left_modality, right_modality, transformation):
         :param transformation: Representation to be applied on the modality
         """
         super().__init__(
-            reduce(or_, [left_modality.modality_type], right_modality.modality_type)
+            reduce(or_, [left_modality.modality_type], right_modality.modality_type),
+            data_type=left_modality.data_type,
         )
         self.transformation = transformation
         self.left_modality = left_modality
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index e04ef685689..b2331d0faed 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -100,8 +100,12 @@ def update_base_metadata(cls, md, data, data_is_single_instance=True):
         dtype = np.nan
         shape = None
         if data_layout is DataLayout.SINGLE_LEVEL:
-            dtype = data[0].dtype
-            shape = data[0].shape
+            if isinstance(data, list):
+                dtype = data[0].dtype
+                shape = data[0].shape
+            elif isinstance(data, np.ndarray):
+                dtype = data.dtype
+                shape = data.shape
         elif data_layout is DataLayout.NESTED_LEVEL:
             if data_is_single_instance:
                 dtype = data.dtype
@@ -210,9 +214,9 @@ def add_field_for_instances(self, md, field, data):
 
         return md
 
-    def create_audio_metadata(self, sampling_rate, data):
+    def create_audio_metadata(self, sampling_rate, data, is_single_instance=True):
         md = deepcopy(self.get_schema())
-        md = ModalitySchemas.update_base_metadata(md, data, True)
+        md = ModalitySchemas.update_base_metadata(md, data, is_single_instance)
         md["frequency"] = sampling_rate
         md["length"] = data.shape[0]
         md["timestamp"] = create_timestamps(sampling_rate, md["length"])
@@ -251,7 +255,11 @@ def get_data_layout(cls, data, data_is_single_instance):
             return None
 
         if data_is_single_instance:
-            if isinstance(data, list):
+            if (
+                isinstance(data, list)
+                or isinstance(data, np.ndarray)
+                and data.ndim == 1
+            ):
                 return DataLayout.SINGLE_LEVEL
             elif isinstance(data, np.ndarray) or isinstance(data, torch.Tensor):
                 return DataLayout.NESTED_LEVEL
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index edcab8cec75..94d1fa057d9 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -112,9 +112,9 @@ def apply_representation(self, representation):
         )
         new_modality.data = []
         start = time.time()
+        original_lengths = []
         if self.data_loader.chunk_size:
             self.data_loader.reset()
-            original_lengths = []
             while self.data_loader.next_chunk < self.data_loader.num_chunks:
                 self.extract_raw_data()
                 transformed_chunk = representation.transform(self)
@@ -122,7 +122,18 @@ def apply_representation(self, representation):
                 for d in transformed_chunk.data:
                     original_lengths.append(d.shape[0])
                 new_modality.metadata.update(transformed_chunk.metadata)
+        else:
+            if not self.has_data():
+                self.extract_raw_data()
+            new_modality = representation.transform(self)
+
+            if not all(
+                "attention_masks" in entry for entry in new_modality.metadata.values()
+            ):
+                for d in new_modality.data:
+                    original_lengths.append(d.shape[0])
 
+        if len(original_lengths) > 0 and min(original_lengths) < max(original_lengths):
             target_length = max(original_lengths)
             padded_embeddings = []
             for embeddings in new_modality.data:
@@ -135,7 +146,7 @@ def apply_representation(self, representation):
                         pad_width=(
                             (0, padding_needed),
                             (0, 0),
-                        ),  # (before, after) for each axis
+                        ),
                         mode="constant",
                         constant_values=0,
                     )
@@ -151,12 +162,6 @@ def apply_representation(self, representation):
                 new_modality.metadata, "attention_masks", attention_masks
             )
             new_modality.data = padded_embeddings
-
-        else:
-            if not self.has_data():
-                self.extract_raw_data()
-            new_modality = representation.transform(self)
-
         new_modality.update_metadata()
         new_modality.transform_time = time.time() - start
         return new_modality
diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py
index 8d8d40f4fd7..e6611f0b7c9 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -34,12 +34,13 @@
 
 @register_representation(ModalityType.TEXT)
 class Bert(UnimodalRepresentation):
-    def __init__(self, model_name="bert", output_file=None):
+    def __init__(self, model_name="bert", output_file=None, max_seq_length=512):
         parameters = {"model_name": "bert"}
         self.model_name = model_name
         super().__init__("Bert", ModalityType.EMBEDDING, parameters)
 
         self.output_file = output_file
+        self.max_seq_length = max_seq_length
 
     def transform(self, modality):
         transformed_modality = TransformedModality(modality, self)
@@ -59,28 +60,28 @@ def transform(self, modality):
         return transformed_modality
 
     def create_embeddings(self, modality, model, tokenizer):
-        embeddings = []
-        for i, d in enumerate(modality.data):
-            inputs = tokenizer(
-                d,
-                return_offsets_mapping=True,
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-            )
-
-            ModalityType.TEXT.add_field(
-                list(modality.metadata.values())[i],
-                "token_to_character_mapping",
-                inputs.data["offset_mapping"][0].tolist(),
-            )
-
-            del inputs.data["offset_mapping"]
-
-            with torch.no_grad():
-                outputs = model(**inputs)
-
-                cls_embedding = outputs.last_hidden_state[0].numpy()
-                embeddings.append(cls_embedding)
-
-        return embeddings
+        """Return the last hidden states for all instances, embedded as one batch."""
+        # Cap sequences at max_seq_length so the constructor argument takes effect.
+        inputs = tokenizer(
+            modality.data,
+            return_offsets_mapping=True,
+            return_tensors="pt",
+            padding="longest",
+            return_attention_mask=True,
+            truncation=True,
+            max_length=self.max_seq_length,
+        )
+        ModalityType.TEXT.add_field_for_instances(
+            modality.metadata,
+            "token_to_character_mapping",
+            inputs.data["offset_mapping"].tolist(),
+        )
+        ModalityType.TEXT.add_field_for_instances(
+            modality.metadata, "attention_masks", inputs.data["attention_mask"].tolist()
+        )
+        # The offset mapping is removed from the inputs before they reach the model.
+        del inputs.data["offset_mapping"]
+        with torch.no_grad():
+            outputs = model(**inputs)
+            cls_embedding = outputs.last_hidden_state.detach().numpy()
+        return cls_embedding
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 8c14c03ac60..8e897542b0c 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -46,19 +46,18 @@ def transform(self, modality):
             modality, self, self.output_modality_type
         )
         result = []
-        max_length = 0
+
         for i, sample in enumerate(modality.data):
             sr = list(modality.metadata.values())[i]["frequency"]
             S = librosa.feature.melspectrogram(
-                y=sample,
+                y=np.array(sample),
                 sr=sr,
                 n_mels=self.n_mels,
                 hop_length=self.hop_length,
                 n_fft=self.n_fft,
-            )
+            ).astype(modality.data_type)
             S_dB = librosa.power_to_db(S, ref=np.max)
-            if S_dB.shape[-1] > max_length:
-                max_length = S_dB.shape[-1]
+
             result.append(S_dB.T)
 
         transformed_modality.data = result
diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py
index 234e93246fd..00f735a756e 100644
--- a/src/main/python/systemds/scuro/representations/mfcc.py
+++ b/src/main/python/systemds/scuro/representations/mfcc.py
@@ -48,20 +48,19 @@ def transform(self, modality):
             modality, self, self.output_modality_type
         )
         result = []
-        max_length = 0
+
         for i, sample in enumerate(modality.data):
             sr = list(modality.metadata.values())[i]["frequency"]
             mfcc = librosa.feature.mfcc(
-                y=sample,
+                y=np.array(sample),
                 sr=sr,
                 n_mfcc=self.n_mfcc,
                 dct_type=self.dct_type,
                 hop_length=self.hop_length,
                 n_mels=self.n_mels,
-            )
+            ).astype(modality.data_type)
             mfcc = (mfcc - np.mean(mfcc)) / np.std(mfcc)
-            if mfcc.shape[-1] > max_length:  # TODO: check if this needs to be done
-                max_length = mfcc.shape[-1]
+
             result.append(mfcc.T)
 
         transformed_modality.data = result
diff --git a/src/main/python/systemds/scuro/representations/spectrogram.py b/src/main/python/systemds/scuro/representations/spectrogram.py
index 6a713a3d21c..5fb1780536d 100644
--- a/src/main/python/systemds/scuro/representations/spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/spectrogram.py
@@ -41,14 +41,13 @@ def transform(self, modality):
             modality, self, self.output_modality_type
         )
         result = []
-        max_length = 0
+
         for i, sample in enumerate(modality.data):
             spectrogram = librosa.stft(
-                y=sample, hop_length=self.hop_length, n_fft=self.n_fft
-            )
+                y=np.array(sample), hop_length=self.hop_length, n_fft=self.n_fft
+            ).astype(modality.data_type)
             S_dB = librosa.amplitude_to_db(np.abs(spectrogram))
-            if S_dB.shape[-1] > max_length:
-                max_length = S_dB.shape[-1]
+
             result.append(S_dB.T)
 
         transformed_modality.data = result
diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py
index 1df5a1fde08..c26039f46a9 100644
--- a/src/main/python/systemds/scuro/representations/tfidf.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -43,8 +43,7 @@ def transform(self, modality):
         vectorizer = TfidfVectorizer(min_df=self.min_df)
 
         X = vectorizer.fit_transform(modality.data)
-        X = [np.array(x).reshape(1, -1) for x in X.toarray()]
-
+        X = [np.array(x).astype(np.float32).reshape(1, -1) for x in X.toarray()]
         if self.output_file is not None:
             save_embeddings(X, self.output_file)
 
diff --git a/src/main/python/systemds/scuro/representations/wav2vec.py b/src/main/python/systemds/scuro/representations/wav2vec.py
index 29f5bcbea02..86145e3769e 100644
--- a/src/main/python/systemds/scuro/representations/wav2vec.py
+++ b/src/main/python/systemds/scuro/representations/wav2vec.py
@@ -52,7 +52,9 @@ def transform(self, modality):
         result = []
         for i, sample in enumerate(modality.data):
             sr = list(modality.metadata.values())[i]["frequency"]
-            audio_resampled = librosa.resample(sample, orig_sr=sr, target_sr=16000)
+            audio_resampled = librosa.resample(
+                np.array(sample), orig_sr=sr, target_sr=16000
+            )
             input = self.processor(
                 audio_resampled, sampling_rate=16000, return_tensors="pt", padding=True
             )
diff --git a/src/main/python/systemds/scuro/representations/window_aggregation.py b/src/main/python/systemds/scuro/representations/window_aggregation.py
index d17c703721b..167f4adafea 100644
--- a/src/main/python/systemds/scuro/representations/window_aggregation.py
+++ b/src/main/python/systemds/scuro/representations/window_aggregation.py
@@ -21,7 +21,7 @@
 import numpy as np
 import math
 
-from systemds.scuro.modality.type import DataLayout
+from systemds.scuro.modality.type import DataLayout, ModalityType
 
 from systemds.scuro.drsearch.operator_registry import register_context_operator
 from systemds.scuro.representations.aggregate import Aggregation
@@ -30,7 +30,7 @@
 
 @register_context_operator()
 class WindowAggregation(Context):
-    def __init__(self, window_size=10, aggregation_function="mean"):
+    def __init__(self, window_size=10, aggregation_function="mean", pad=True):
         parameters = {
             "window_size": [window_size],
             "aggregation_function": list(Aggregation().get_aggregation_functions()),
@@ -38,6 +38,7 @@ def __init__(self, window_size=10, aggregation_function="mean"):
         super().__init__("WindowAggregation", parameters)
         self.window_size = window_size
         self.aggregation_function = aggregation_function
+        self.pad = pad
 
     @property
     def aggregation_function(self):
@@ -49,6 +50,7 @@ def aggregation_function(self, value):
 
     def execute(self, modality):
         windowed_data = []
+        original_lengths = []
         for instance in modality.data:
             new_length = math.ceil(len(instance) / self.window_size)
             if modality.get_data_layout() == DataLayout.SINGLE_LEVEL:
@@ -59,14 +61,53 @@ def execute(self, modality):
                 windowed_instance = self.window_aggregate_nested_level(
                     instance, new_length
                 )
-
+            original_lengths.append(new_length)
             windowed_data.append(windowed_instance)
 
+        if self.pad and not isinstance(windowed_data, np.ndarray):
+            target_length = max(original_lengths)
+            sample_shape = windowed_data[0].shape
+            is_1d = len(sample_shape) == 1
+
+            padded_features = []
+            for i, features in enumerate(windowed_data):
+                current_len = original_lengths[i]
+
+                if current_len < target_length:
+                    padding_needed = target_length - current_len
+
+                    if is_1d:
+                        padding = np.zeros(padding_needed)
+                        padded = np.concatenate([features, padding])
+                    else:
+                        feature_dim = features.shape[-1]
+                        padding = np.zeros((padding_needed, feature_dim))
+                        padded = np.concatenate([features, padding], axis=0)
+
+                    padded_features.append(padded)
+                else:
+                    padded_features.append(features)
+
+            attention_masks = np.zeros((len(windowed_data), target_length))
+            for i, length in enumerate(original_lengths):
+                actual_length = min(length, target_length)
+                attention_masks[i, :actual_length] = 1
+
+            ModalityType(modality.modality_type).add_field_for_instances(
+                modality.metadata, "attention_masks", attention_masks
+            )
+
+            windowed_data = np.array(padded_features)
+            data_type = list(modality.metadata.values())[0]["data_layout"]["type"]
+            if data_type != "str":
+                windowed_data = windowed_data.astype(data_type)
+
         return windowed_data
 
     def window_aggregate_single_level(self, instance, new_length):
         if isinstance(instance, str):
             return instance
+        instance = np.array(instance)
         num_cols = instance.shape[1] if instance.ndim > 1 else 1
         result = np.empty((new_length, num_cols))
         for i in range(0, new_length):
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index 0210207a013..aa28499e636 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -65,7 +65,9 @@ def transform(self, modality):
         embeddings = []
         for sentences in modality.data:
             tokens = list(tokenize(sentences.lower()))
-            embeddings.append(np.array(get_embedding(tokens, model)).reshape(1, -1))
+            embeddings.append(
+                np.array(get_embedding(tokens, model)).reshape(1, -1).astype(np.float32)
+            )
 
         if self.output_file is not None:
             save_embeddings(np.array(embeddings), self.output_file)
diff --git a/src/main/python/systemds/scuro/utils/static_variables.py b/src/main/python/systemds/scuro/utils/static_variables.py
index b1733387160..b1b3e657a24 100644
--- a/src/main/python/systemds/scuro/utils/static_variables.py
+++ b/src/main/python/systemds/scuro/utils/static_variables.py
@@ -1,7 +1,16 @@
 import numpy as np
+import torch
 
 global_rng = np.random.default_rng(42)
 
 
 def get_seed():
     return global_rng.integers(0, 1024)
+
+
+def get_device():
+    """Return the preferred torch device: CUDA first, then Apple MPS, else CPU."""
+    if torch.cuda.is_available():
+        return torch.device("cuda:0")
+    # torch.backends.mps.is_available() is the documented, long-standing check.
+    return torch.device("mps" if torch.backends.mps.is_available() else "cpu")
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index fbb50ac180e..e2dceec329d 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -93,10 +93,14 @@ def create1DModality(
         self.modality_id += 1
         return tf_modality
 
-    def create_audio_data(self, num_instances, num_features):
-        data = np.random.rand(num_instances, num_features).astype(np.float32)
+    def create_audio_data(self, num_instances, max_audio_length):
+        data = [
+            [random.random() for _ in range(random.randint(1, max_audio_length))]
+            for _ in range(num_instances)
+        ]
+
         metadata = {
-            i: ModalityType.AUDIO.create_audio_metadata(16000, data[i])
+            i: ModalityType.AUDIO.create_audio_metadata(16000, np.array(data[i]))
             for i in range(num_instances)
         }
 
@@ -165,26 +169,29 @@ def create_text_data(self, num_instances):
 
         return sentences, metadata
 
-    def create_visual_modality(self, num_instances, num_frames=1, height=28, width=28):
-        if num_frames == 1:
+    def create_visual_modality(
+        self, num_instances, max_num_frames=1, height=28, width=28
+    ):
+        """Generate random uint8 frame stacks of varying length with their metadata."""
+        frame_counts = np.random.randint(
+            min(5, max_num_frames), max_num_frames + 1, size=num_instances
+        )  # lower bound is clamped: randint raises when low >= high
+        data = [
+            np.random.randint(0, 256, (n, height, width, 3), dtype=np.uint8)
+            for n in frame_counts
+        ]
+        metadata = {}  # image metadata is still a TODO; avoid an unbound name below
+        if max_num_frames == 1:
             print(f"TODO: create image metadata")
         else:
             metadata = {
                 i: ModalityType.VIDEO.create_video_metadata(
-                    30, num_frames, width, height, 1
+                    30, data[i].shape[0], width, height, 3
                 )
                 for i in range(num_instances)
             }
 
-        return (
-            np.random.randint(
-                0,
-                256,
-                (num_instances, num_frames, height, width),
-                # ).astype(np.float16).tolist(),
-            ).astype(np.float16),
-            metadata,
-        )
+        return (data, metadata)
 
 
 def setup_data(modalities, num_instances, path):
diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py
index 9e3a16ffcad..5fd22dc8d98 100644
--- a/src/main/python/tests/scuro/test_multimodal_join.py
+++ b/src/main/python/tests/scuro/test_multimodal_join.py
@@ -20,7 +20,6 @@
 
 # TODO: Test edge cases: unequal number of audio-video timestamps (should still work and add the average over all audio/video samples)
 
-import shutil
 import unittest
 
 import numpy as np
@@ -30,9 +29,6 @@
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.resnet import ResNet
 from tests.scuro.data_generator import TestDataLoader, ModalityRandomDataGenerator
-
-from systemds.scuro.dataloader.audio_loader import AudioLoader
-from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.modality.type import ModalityType
 
 
diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index 41bd2af1367..192567e92ee 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -182,23 +182,17 @@ def optimize_unimodal_representation_for_modality(self, modality):
         ):
             registry = Registry()
 
-            unimodal_optimizer = UnimodalOptimizer(
-                [modality], self.tasks
-            )
+            unimodal_optimizer = UnimodalOptimizer([modality], self.tasks)
             unimodal_optimizer.optimize()
 
             assert (
-                list(unimodal_optimizer.operator_performance.keys())[0]
+                unimodal_optimizer.operator_performance.modality_ids[0]
                 == modality.modality_id
             )
-            assert len(list(unimodal_optimizer.operator_performance.values())[0]) == 2
+            assert len(unimodal_optimizer.operator_performance.task_names) == 2
             assert (
-                len(
-                    unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0])[
-                        0
-                    ].representations
-                )
-                >= 1
+                len(unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0]))
+                == 1
             )
 
 

From 1c37f2e8645a149f79b434a701167814d4230ab3 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Thu, 7 Aug 2025 16:59:20 +0200
Subject: [PATCH 13/19] add cache

---
 .../scuro/drsearch/multimodal_optimizer.py    | 124 ++++++++++++++----
 .../scuro/drsearch/unimodal_optimizer.py      |  95 ++++++++------
 .../systemds/scuro/modality/modality.py       |  20 +--
 .../scuro/representations/word2vec.py         |   4 +-
 4 files changed, 166 insertions(+), 77 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index e6d7abd25c3..ff5c431aa36 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -1,3 +1,5 @@
+import itertools
+
 from systemds.scuro.representations.aggregated_representation import (
     AggregatedRepresentation,
 )
@@ -11,13 +13,17 @@
 
 
 class MultimodalOptimizer:
-    def __init__(self, modalities, unimodal_optimizer, tasks, k=2, debug=True):
+    def __init__(
+        self, modalities, unimodal_optimization_results, tasks, k=2, debug=True
+    ):
+        self.k_best_cache = None
         self.k_best_modalities = None
         self.modalities = modalities
-        self.unimodal_optimizer = unimodal_optimizer
+        self.unimodal_optimization_results = unimodal_optimization_results
         self.tasks = tasks
         self.k = k
         self.extract_k_best_modalities_per_task()
+
         self.operator_registry = Registry()
         self.optimization_results = MultimodalResults(
             modalities, tasks, debug, self.k_best_modalities
@@ -26,7 +32,7 @@ def __init__(self, modalities, unimodal_optimizer, tasks, k=2, debug=True):
 
     def optimize(self):
         for task in self.tasks:
-            self.optimize_intramodal_representations(task)
+            # self.optimize_intramodal_representations(task)
             self.optimize_intermodal_representations(task)
 
     def optimize_intramodal_representations(self, task):
@@ -35,7 +41,7 @@ def optimize_intramodal_representations(self, task):
                 modality.modality_id
             ]
             applied_representations = self.extract_representations(
-                representations, modality
+                representations, modality, task.model.name
             )
 
             for i in range(1, len(applied_representations)):
@@ -73,33 +79,76 @@ def optimize_intramodal_representations(self, task):
                         )
 
     def optimize_intermodal_representations(self, task):
-        pass
+        modality_combos = []
+        n = len(self.k_best_cache[task.model.name])
+
+        # Generate combinations in depth-first order
+        def generate_extensions(current_combo, remaining_indices):
+            # Add current combination if it has at least 2 elements
+            if len(current_combo) >= 2:
+                combo_tuple = tuple(i for i in current_combo)
+                modality_combos.append(combo_tuple)
+            # Generate all possible extensions
+            for i in remaining_indices:
+                new_combo = current_combo + [i]
+                new_remaining = [j for j in remaining_indices if j > i]
+                generate_extensions(new_combo, new_remaining)
+
+        # Start with each possible first element
+        for start_idx in range(n):
+            remaining = list(range(start_idx + 1, n))
+            generate_extensions([start_idx], remaining)
 
-    def extract_representations(self, representations, modality):
+        print(modality_combos)
+
+    def _evaluate_inter_modal(self, task, modality_combo):
+        fused_representation = None
+        for modality_id in modality_combo:
+            fused_representation = self.evaluate(task, modality_id, None, None, None)
+
+    def extract_representations(self, representations, modality, task_name):
         applied_representations = []
         for i in range(0, len(representations)):
-            applied_representation = modality
-            for j, rep in enumerate(representations[i].representations):
-                representation, is_context = (
-                    self.operator_registry.get_representation_by_name(
-                        rep, modality.modality_type
-                    )
+            cache_key = (
+                tuple(representations[i].representations),
+                representations[i].task_time,
+                representations[i].representation_time,
+            )
+            if (
+                cache_key
+                in self.unimodal_optimization_results.cache[modality.modality_id][
+                    task_name
+                ]
+            ):
+                applied_representations.append(
+                    self.unimodal_optimization_results.cache[modality.modality_id][
+                        task_name
+                    ][cache_key]
                 )
-                if representation is None:
-                    if rep == AggregatedRepresentation.__name__:
-                        representation = AggregatedRepresentation(Aggregation())
-                else:
-                    representation = representation()
-                representation.set_parameters(representations[i].params[j])
-                if is_context:
-                    applied_representation = applied_representation.context(
-                        representation
-                    )
-                else:
-                    applied_representation = (
-                        applied_representation.apply_representation(representation)
+            else:
+                applied_representation = modality
+                for j, rep in enumerate(representations[i].representations):
+                    representation, is_context = (
+                        self.operator_registry.get_representation_by_name(
+                            rep, modality.modality_type
+                        )
                     )
-            applied_representations.append(applied_representation)
+                    if representation is None:
+                        if rep == AggregatedRepresentation.__name__:
+                            representation = AggregatedRepresentation(Aggregation())
+                    else:
+                        representation = representation()
+                    representation.set_parameters(representations[i].params[j])
+                    if is_context:
+                        applied_representation = applied_representation.context(
+                            representation
+                        )
+                    else:
+                        applied_representation = (
+                            applied_representation.apply_representation(representation)
+                        )
+                self.k_best_cache[task_name].append(applied_representation)
+                applied_representations.append(applied_representation)
         return applied_representations
 
     def evaluate(self, task, modality, representations, fusion, modality_ids):
@@ -127,13 +176,32 @@ def add_to_cache(self, result_idx, combined_modality):
 
     def extract_k_best_modalities_per_task(self):
         self.k_best_modalities = {}
+        self.k_best_cache = {}
         for task in self.tasks:
             self.k_best_modalities[task.model.name] = {}
+            self.k_best_cache[task.model.name] = []
             for modality in self.modalities:
-                self.k_best_modalities[task.model.name][modality.modality_id] = (
-                    self.unimodal_optimizer.get_k_best_results(modality, self.k, task)
+                k_best_results, cached_data = (
+                    self.unimodal_optimization_results.get_k_best_results(
+                        modality, self.k, task
+                    )
                 )
 
+                self.k_best_modalities[task.model.name][
+                    modality.modality_id
+                ] = k_best_results
+                self.k_best_cache[task.model.name].extend(cached_data)
+
+    def create_modality_index(self, task):
+        counter = 0
+        k_best_idx = []
+        for modality_id, values in self.k_best_modalities[task].items():
+            for _ in values:
+                k_best_idx.append((modality_id, counter))
+                counter += 1
+
+        return k_best_idx
+
 
 class MultimodalResults:
     def __init__(self, modalities, tasks, debug, k_best_modalities):
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 21dce69840c..b6f75162c00 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -21,9 +21,12 @@
 import pickle
 import time
 from concurrent.futures import ProcessPoolExecutor, as_completed
-from dataclasses import dataclass
+from dataclasses import dataclass, field, asdict
 
 import multiprocessing as mp
+from typing import Union
+
+import numpy as np
 from systemds.scuro.representations.window_aggregation import WindowAggregation
 
 from build.lib.systemds.scuro.representations.aggregated_representation import (
@@ -60,28 +63,13 @@ def store_results(self, file_name=None):
         with open(file_name, "wb") as f:
             pickle.dump(self.operator_performance, f)
 
-    def get_k_best_results(self, modality, k, task):
-        """
-        Get the k best results for the given modality
-        :param modality: modality to get the best results for
-        :param k: number of best results
-        """
-
-        results = sorted(
-            self.operator_performance.results[modality.modality_id][task.model.name],
-            key=lambda x: x.val_score,
-            reverse=True,
-        )[:k]
-
-        return results
-
     def optimize_parallel(self, n_workers=None):
         if n_workers is None:
             n_workers = min(len(self.modalities), mp.cpu_count())
 
         with ProcessPoolExecutor(max_workers=n_workers) as executor:
             future_to_modality = {
-                executor.submit(self._process_modality, modality): modality
+                executor.submit(self._process_modality, modality, True): modality
                 for modality in self.modalities
             }
 
@@ -95,13 +83,17 @@ def optimize_parallel(self, n_workers=None):
 
     def optimize(self):
         for modality in self.modalities:
-            local_result = self._process_modality(modality)
-            self._merge_results(local_result)
+            local_result = self._process_modality(modality, False)
+            # self._merge_results(local_result)
+
+    def _process_modality(self, modality, parallel):
+        if parallel:
+            local_results = UnimodalResults(
+                modalities=[modality], tasks=self.tasks, debug=False
+            )
+        else:
+            local_results = self.operator_performance
 
-    def _process_modality(self, modality):
-        local_results = UnimodalResults(
-            modalities=[modality], tasks=self.tasks, debug=False
-        )
         context_operators = self.operator_registry.get_context_operators()
 
         for context_operator in context_operators:
@@ -148,6 +140,11 @@ def _merge_results(self, local_results):
                     local_results.results[modality_id][task_name]
                 )
 
+        for modality in self.modalities:
+            for task_name in local_results.cache[modality]:
+                for key, value in local_results.cache[modality][task_name].items():
+                    self.operator_performance.cache[modality][task_name][key] = value
+
     def _evaluate_local(self, modality, representations, local_results):
         if self._tasks_require_same_dims:
             if self.expected_dimensions == 1 and get_shape(modality.metadata) > 1:
@@ -165,9 +162,8 @@ def _evaluate_local(self, modality, representations, local_results):
                     local_results.add_result(
                         scores,
                         reps,
-                        modality.modality_id,
+                        modality,
                         task.model.name,
-                        modality.transform_time,
                         end - start,
                     )
             else:
@@ -179,9 +175,8 @@ def _evaluate_local(self, modality, representations, local_results):
                     local_results.add_result(
                         scores,
                         representations,
-                        modality.modality_id,
+                        modality,
                         task.model.name,
-                        modality.transform_time,
                         end - start,
                     )
         else:
@@ -200,9 +195,8 @@ def _evaluate_local(self, modality, representations, local_results):
                     local_results.add_result(
                         scores,
                         reps,
-                        modality.modality_id,
+                        modality,
                         task.model.name,
-                        modality.transform_time,
                         end - start,
                     )
                 else:
@@ -213,9 +207,8 @@ def _evaluate_local(self, modality, representations, local_results):
                     local_results.add_result(
                         scores,
                         representations,
-                        modality.modality_id,
+                        modality,
                         task.model.name,
-                        modality.transform_time,
                         end - start,
                     )
 
@@ -226,15 +219,16 @@ def __init__(self, modalities, tasks, debug=False):
         self.task_names = [task.model.name for task in tasks]
         self.results = {}
         self.debug = debug
+        self.cache = {}
 
         for modality in self.modality_ids:
             self.results[modality] = {}
+            self.cache[modality] = {}
             for task_name in self.task_names:
+                self.cache[modality][task_name] = {}
                 self.results[modality][task_name] = []
 
-    def add_result(
-        self, scores, representations, modality_id, task_name, rep_time, task_time
-    ):
+    def add_result(self, scores, representations, modality, task_name, task_time):
         parameters = []
         representation_names = []
 
@@ -260,13 +254,16 @@ def add_result(
             params=parameters,
             train_score=scores[0],
             val_score=scores[1],
-            representation_time=rep_time,
+            representation_time=modality.transform_time,
             task_time=task_time,
         )
-        self.results[modality_id][task_name].append(entry)
+        self.results[modality.modality_id][task_name].append(entry)
+        self.cache[modality.modality_id][task_name][
+            (tuple(representation_names), scores[1], modality.transform_time)
+        ] = modality
 
         if self.debug:
-            print(f"{modality_id}_{task_name}: {entry}")
+            print(f"{modality.modality_id}_{task_name}: {entry}")
 
     def print_results(self):
         for modality in self.modality_ids:
@@ -274,8 +271,30 @@ def print_results(self):
                 for entry in self.results[modality][task_name]:
                     print(f"{modality}_{task_name}: {entry}")
 
+    def get_k_best_results(self, modality, k, task):
+        """
+        Get the k best results for the given modality
+        :param modality: modality to get the best results for
+        :param k: number of best results
+        """
+        items = self.results[modality.modality_id][task.model.name]
+        sorted_indices = sorted(
+            range(len(items)), key=lambda x: items[x].val_score, reverse=True
+        )[:k]
+
+        results = sorted(
+            self.results[modality.modality_id][task.model.name],
+            key=lambda x: x.val_score,
+            reverse=True,
+        )[:k]
+
+        items = list(self.cache[modality.modality_id][task.model.name].items())
+        reordered_cache = [items[i][1] for i in sorted_indices]
+
+        return results, list(reordered_cache)
+
 
-@dataclass
+@dataclass(frozen=True)
 class ResultEntry:
     val_score: float
     representations: list
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index e4240690f18..1af88282f76 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -93,34 +93,38 @@ def update_metadata(self):
             updated_md = self.modality_type.update_metadata(md_v, self.data[i])
             self.metadata[md_k] = updated_md
 
-    def flatten(self, padding=True):
+    def flatten(self, padding=False):
         """
         Flattens modality data by row-wise concatenation
         Prerequisite for some ML-models
         """
         max_len = 0
+        data = []
         for num_instance, instance in enumerate(self.data):
             if type(instance) is np.ndarray:
-                self.data[num_instance] = instance.flatten()
+                d = instance.flatten()
+                max_len = max(max_len, len(d))
+                data.append(d)
             elif isinstance(instance, List):
-                self.data[num_instance] = np.array(
+                d = np.array(
                     [item for sublist in instance for item in sublist]
                 ).flatten()
-            max_len = max(max_len, len(self.data[num_instance]))
+                max_len = max(max_len, len(d))
+                data.append(d)
 
         if padding:
-            for i, instance in enumerate(self.data):
+            for i, instance in enumerate(data):
                 if isinstance(instance, np.ndarray):
                     if len(instance) < max_len:
                         padded_data = np.zeros(max_len, dtype=instance.dtype)
                         padded_data[: len(instance)] = instance
-                        self.data[i] = padded_data
+                        data[i] = padded_data
                 else:
                     padded_data = []
                     for entry in instance:
                         padded_data.append(utils.pad_sequences(entry, max_len))
-                    self.data[i] = padded_data
-        self.data = np.array(self.data)
+                    data[i] = padded_data
+        self.data = np.array(data)
         return self
 
     def pad(self, value=0):
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index aa28499e636..8543379bc14 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -65,9 +65,7 @@ def transform(self, modality):
         embeddings = []
         for sentences in modality.data:
             tokens = list(tokenize(sentences.lower()))
-            embeddings.append(
-                np.array(get_embedding(tokens, model)).reshape(1, -1).astype(np.float32)
-            )
+            embeddings.append(np.array(get_embedding(tokens, model)).astype(np.float32))
 
         if self.output_file is not None:
             save_embeddings(np.array(embeddings), self.output_file)

From 74b304e58f696bf8aa3463c1e3b48c9ce6b4c02c Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Fri, 15 Aug 2025 10:57:44 +0200
Subject: [PATCH 14/19] add first version of multimodal optimizer

---
 .../scuro/drsearch/multimodal_optimizer.py    | 148 ++++++++++++------
 .../scuro/drsearch/unimodal_optimizer.py      |   4 +-
 .../systemds/scuro/modality/modality.py       |  13 +-
 .../systemds/scuro/modality/transformed.py    |  18 ++-
 4 files changed, 130 insertions(+), 53 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index ff5c431aa36..7502cf07a72 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -32,7 +32,6 @@ def __init__(
 
     def optimize(self):
         for task in self.tasks:
-            # self.optimize_intramodal_representations(task)
             self.optimize_intermodal_representations(task)
 
     def optimize_intramodal_representations(self, task):
@@ -82,29 +81,92 @@ def optimize_intermodal_representations(self, task):
         modality_combos = []
         n = len(self.k_best_cache[task.model.name])
 
-        # Generate combinations in depth-first order
         def generate_extensions(current_combo, remaining_indices):
             # Add current combination if it has at least 2 elements
             if len(current_combo) >= 2:
                 combo_tuple = tuple(i for i in current_combo)
                 modality_combos.append(combo_tuple)
-            # Generate all possible extensions
+
             for i in remaining_indices:
                 new_combo = current_combo + [i]
                 new_remaining = [j for j in remaining_indices if j > i]
                 generate_extensions(new_combo, new_remaining)
 
-        # Start with each possible first element
         for start_idx in range(n):
             remaining = list(range(start_idx + 1, n))
             generate_extensions([start_idx], remaining)
+        fusion_methods = self.operator_registry.get_fusion_operators()
+        fused_representations = []
+        reuse_fused_representations = False
+        for i, modality_combo in enumerate(modality_combos):
+            if i != 0:
+                reuse_fused_representations = self.is_prefix_match(
+                    modality_combos[i], modality_combo
+                )
+            if self.debug:
+                print(
+                    f"New modality combo: {modality_combo} - Reuse: {reuse_fused_representations} - # fused reps: {len(fused_representations)}"
+                )
+            if reuse_fused_representations:
+                mods = [
+                    self.k_best_cache[task.model.name][mod_idx]
+                    for mod_idx in modality_combo[len(modality_combos[i - 1]) :]
+                ]
+            all_mods = [
+                self.k_best_cache[task.model.name][mod_idx]
+                for mod_idx in modality_combo
+            ]
+            temp_fused_reps = []
+            for j, fusion_method in enumerate(fusion_methods):
+                # Evaluate all mods
+                fused_rep = all_mods[0].combine(all_mods[1:], fusion_method())
+                temp_fused_reps.append(fused_rep)
+                self.evaluate(
+                    task,
+                    fused_rep,
+                    [
+                        self.k_best_modalities[task.model.name][k].representations
+                        for k in modality_combo
+                    ],
+                    fusion_method,
+                    modality_combo,
+                )
+                if reuse_fused_representations:
+                    for fused_representation in fused_representations:
+                        fused_rep = fused_representation.combine(mods, fusion_method())
+                        temp_fused_reps.append(fused_rep)
+                        self.evaluate(
+                            task,
+                            fused_rep,
+                            [
+                                self.k_best_modalities[task.model.name][
+                                    k
+                                ].representations
+                                for k in modality_combo
+                            ],
+                            fusion_method,
+                            modality_combo,
+                        )
+            fused_representations = temp_fused_reps
+            reuse_fused_representations = False
+
+    def is_prefix_match(self, seq1, seq2):
+        """
+        Check if seq1 is a prefix of seq2.
 
-        print(modality_combos)
+        Args:
+            seq1: First sequence (list)
+            seq2: Second sequence (list)
 
-    def _evaluate_inter_modal(self, task, modality_combo):
-        fused_representation = None
-        for modality_id in modality_combo:
-            fused_representation = self.evaluate(task, modality_id, None, None, None)
+        Returns:
+            Boolean indicating whether seq1 is a prefix of seq2
+        """
+        # seq1 can only be a prefix if it's not longer than seq2
+        if len(seq1) > len(seq2):
+            return False
+
+        # Check if seq1 matches the beginning of seq2
+        return seq2[: len(seq1)] == seq1
 
     def extract_representations(self, representations, modality, task_name):
         applied_representations = []
@@ -151,10 +213,9 @@ def extract_representations(self, representations, modality, task_name):
                 applied_representations.append(applied_representation)
         return applied_representations
 
-    def evaluate(self, task, modality, representations, fusion, modality_ids):
+    def evaluate(self, task, modality, representations, fusion, modality_combo):
         if task.expected_dim == 1 and get_shape(modality.metadata) > 1:
             for aggregation in Aggregation().get_aggregation_functions():
-                # padding should not be necessary here
                 agg_operator = AggregatedRepresentation(Aggregation(aggregation, False))
                 agg_modality = agg_operator.transform(modality)
 
@@ -163,12 +224,20 @@ def evaluate(self, task, modality, representations, fusion, modality_ids):
                 reps.append(agg_operator)
 
                 self.optimization_results.add_result(
-                    scores, reps, [fusion], modality_ids, task.model.name
+                    scores,
+                    reps,
+                    modality.transformation,
+                    modality_combo,
+                    task.model.name,
                 )
         else:
             scores = task.run(modality.data)
             self.optimization_results.add_result(
-                scores, representations, [fusion], modality_ids, task.model.name
+                scores,
+                representations,
+                modality.transformation,
+                modality_combo,
+                task.model.name,
             )
 
     def add_to_cache(self, result_idx, combined_modality):
@@ -178,7 +247,7 @@ def extract_k_best_modalities_per_task(self):
         self.k_best_modalities = {}
         self.k_best_cache = {}
         for task in self.tasks:
-            self.k_best_modalities[task.model.name] = {}
+            self.k_best_modalities[task.model.name] = []
             self.k_best_cache[task.model.name] = []
             for modality in self.modalities:
                 k_best_results, cached_data = (
@@ -187,21 +256,9 @@ def extract_k_best_modalities_per_task(self):
                     )
                 )
 
-                self.k_best_modalities[task.model.name][
-                    modality.modality_id
-                ] = k_best_results
+                self.k_best_modalities[task.model.name].extend(k_best_results)
                 self.k_best_cache[task.model.name].extend(cached_data)
 
-    def create_modality_index(self, task):
-        counter = 0
-        k_best_idx = []
-        for modality_id, values in self.k_best_modalities[task].items():
-            for _ in values:
-                k_best_idx.append((modality_id, counter))
-                counter += 1
-
-        return k_best_idx
-
 
 class MultimodalResults:
     def __init__(self, modalities, tasks, debug, k_best_modalities):
@@ -215,35 +272,36 @@ def __init__(self, modalities, tasks, debug, k_best_modalities):
             self.results[task.model.name] = {}
 
     def add_result(
-        self, scores, best_representation_idx, fusion_methods, modality_ids, task_name
+        self, scores, best_representation_idx, fusion_methods, modality_combo, task_name
     ):
 
         entry = MultimodalResultEntry(
             representations=best_representation_idx,
             train_score=scores[0],
             val_score=scores[1],
-            fusion_methods=[fusion_method.__name__ for fusion_method in fusion_methods],
-            modality_ids=modality_ids,
+            fusion_methods=[
+                fusion_method.__class__.__name__ for fusion_method in fusion_methods
+            ],
+            modality_combo=modality_combo,
             task=task_name,
         )
 
-        modality_id_strings = "_".join(list(map(str, modality_ids)))
+        modality_id_strings = "_".join(list(map(str, modality_combo)))
         if not modality_id_strings in self.results[task_name]:
             self.results[task_name][modality_id_strings] = []
 
         self.results[task_name][modality_id_strings].append(entry)
 
+        if self.debug:
+            print(f"{modality_id_strings}_{task_name}: {entry}")
+
     def print_results(self):
         for task_name in self.task_names:
             for modality in self.results[task_name].keys():
                 for entry in self.results[task_name][modality]:
                     reps = []
-                    for i, mod_idx in enumerate(entry.modality_ids):
-                        reps.append(
-                            self.k_best_modalities[task_name][mod_idx][
-                                entry.representations[i]
-                            ]
-                        )
+                    for i, mod_idx in enumerate(entry.modality_combo):
+                        reps.append(self.k_best_modalities[task_name][mod_idx])
 
                     print(
                         f"{modality}_{task_name}: "
@@ -251,22 +309,18 @@ def print_results(self):
                     )
                     for i, rep in enumerate(reps):
                         print(
-                            f"    Representation: {entry.modality_ids[i]} - {rep.representations}"
+                            f"    Representation: {entry.modality_combo[i]} - {rep.representations}"
                         )
-                        if i < len(reps) - 1:
-                            print(f"    Fusion: {entry.fusion_methods[i]} ")
+                        # if i < len(reps) - 1:
+                    print(f"    Fusion: {entry.fusion_methods[0]} ")
 
     def store_results(self, file_name=None):
         for task_name in self.task_names:
             for modality in self.results[task_name].keys():
                 for entry in self.results[task_name][modality]:
                     reps = []
-                    for i, mod_idx in enumerate(entry.modality_ids):
-                        reps.append(
-                            self.k_best_modalities[task_name][mod_idx][
-                                entry.representations[i]
-                            ]
-                        )
+                    for i, mod_idx in enumerate(entry.modality_combo):
+                        reps.append(self.k_best_modalities[task_name][mod_idx])
                     entry.representations = reps
 
         import pickle
@@ -284,7 +338,7 @@ def store_results(self, file_name=None):
 @dataclasses.dataclass
 class MultimodalResultEntry:
     val_score: float
-    modality_ids: list
+    modality_combo: list
     representations: list
     fusion_methods: list
     train_score: float
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index b6f75162c00..1e114bb34ee 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -61,7 +61,7 @@ def store_results(self, file_name=None):
             file_name = "unimodal_optimizer" + timestr + ".pkl"
 
         with open(file_name, "wb") as f:
-            pickle.dump(self.operator_performance, f)
+            pickle.dump(self.operator_performance.results, f)
 
     def optimize_parallel(self, n_workers=None):
         if n_workers is None:
@@ -167,7 +167,7 @@ def _evaluate_local(self, modality, representations, local_results):
                         end - start,
                     )
             else:
-                # modality.pad()
+                modality.pad()
                 for task in self.tasks:
                     start = time.time()
                     scores = task.run(modality.data)
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 1af88282f76..94e745b2cc1 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -22,6 +22,7 @@
 from typing import List
 
 import numpy as np
+from numpy.f2py.auxfuncs import throw_error
 
 from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.representations import utils
@@ -127,17 +128,23 @@ def flatten(self, padding=False):
         self.data = np.array(data)
         return self
 
-    def pad(self, value=0):
+    def pad(self, value=0, max_len=None):
         try:
-            result = np.array(self.data)
+            if max_len is None:
+                result = np.array(self.data)
+            else:
+                raise ValueError("Needs padding to max_len")
         except:
-            maxlen = max([len(seq) for seq in self.data])
+            maxlen = (
+                max([len(seq) for seq in self.data]) if max_len is None else max_len
+            )
 
             result = np.full((len(self.data), maxlen), value, dtype=self.data_type)
 
             for i, seq in enumerate(self.data):
                 data = seq[:maxlen]
                 result[i, : len(data)] = data
+                # TODO: add padding to metadata as attention_masks
 
         self.data = result
 
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index 63ca10251c4..6523e9502fc 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -27,6 +27,7 @@
 from systemds.scuro.modality.modality import Modality
 from systemds.scuro.representations.window_aggregation import WindowAggregation
 import time
+import copy
 
 
 class TransformedModality(Modality):
@@ -44,7 +45,22 @@ def __init__(self, modality, transformation, new_modality_type=None):
         super().__init__(
             new_modality_type, modality.modality_id, metadata, modality.data_type
         )
-        self.transformation = transformation
+        self.transformation = None
+        self.add_transformation(transformation, modality)
+
+    def add_transformation(self, transformation, modality):
+        """Set self.transformation to the inherited chain plus `transformation`."""
+
+        def is_fusion(op):
+            return op.__class__.__bases__[0].__name__ == "Fusion"
+
+        source = getattr(modality, "transformation", None) or []
+        inherit = type(modality).__name__ == "TransformedModality"
+        # A first fusion starts a new chain; a source without a chain is allowed.
+        if is_fusion(transformation) and not (source and is_fusion(source[0])):
+            inherit = False
+        self.transformation = copy.deepcopy(source) if inherit else []
+        self.transformation.append(transformation)
 
     def copy_from_instance(self):
         return type(self)(self, self.transformation)

From 3a8f78f055373726d3e532e22a2771c6bceeda8c Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Fri, 15 Aug 2025 14:36:46 +0200
Subject: [PATCH 15/19] add caching

---
 .../scuro/drsearch/multimodal_optimizer.py    | 40 ++++++++++++++++---
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index 7502cf07a72..0a4dd0c3489 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -23,6 +23,7 @@ def __init__(
         self.tasks = tasks
         self.k = k
         self.extract_k_best_modalities_per_task()
+        self.debug = debug
 
         self.operator_registry = Registry()
         self.optimization_results = MultimodalResults(
@@ -80,6 +81,7 @@ def optimize_intramodal_representations(self, task):
     def optimize_intermodal_representations(self, task):
         modality_combos = []
         n = len(self.k_best_cache[task.model.name])
+        reuse_cache = {}
 
         def generate_extensions(current_combo, remaining_indices):
             # Add current combination if it has at least 2 elements
@@ -99,19 +101,33 @@ def generate_extensions(current_combo, remaining_indices):
         fused_representations = []
         reuse_fused_representations = False
         for i, modality_combo in enumerate(modality_combos):
+            # clear reuse cache
+            
             if i != 0:
                 reuse_fused_representations = self.is_prefix_match(
-                    modality_combos[i], modality_combo
-                )
-            if self.debug:
-                print(
-                    f"New modality combo: {modality_combo} - Reuse: {reuse_fused_representations} - # fused reps: {len(fused_representations)}"
+                    modality_combos[i-1], modality_combo
                 )
             if reuse_fused_representations:
                 mods = [
                     self.k_best_cache[task.model.name][mod_idx]
                     for mod_idx in modality_combo[len(modality_combos[i - 1]) :]
                 ]
+                fused_representations = reuse_cache[modality_combos[i - 1]]
+            else:
+                prefix_idx = self.compute_equal_prefix_index(modality_combos[i-1], modality_combo)
+                if prefix_idx > 1:
+                    fused_representations = reuse_cache[modality_combos[i - 1][:prefix_idx]]
+                    reuse_fused_representations = True
+                    mods = [
+                        self.k_best_cache[task.model.name][mod_idx]
+                        for mod_idx in modality_combo[prefix_idx:]
+                    ]
+            if self.debug:
+                print(
+                    f"New modality combo: {modality_combo} - Reuse: {reuse_fused_representations} - # fused reps: {len(fused_representations)}"
+                )
+                
+                
             all_mods = [
                 self.k_best_cache[task.model.name][mod_idx]
                 for mod_idx in modality_combo
@@ -147,7 +163,9 @@ def generate_extensions(current_combo, remaining_indices):
                             fusion_method,
                             modality_combo,
                         )
-            fused_representations = temp_fused_reps
+        
+            if len(modality_combo) < len(self.k_best_cache[task.model.name]) and i +1 < len(modality_combos) and self.is_prefix_match(modality_combos[i], modality_combos[i+1]):
+                reuse_cache[modality_combo] = temp_fused_reps
             reuse_fused_representations = False
 
     def is_prefix_match(self, seq1, seq2):
@@ -162,11 +180,21 @@ def is_prefix_match(self, seq1, seq2):
             Boolean indicating whether seq1 is a prefix of seq2
         """
         # seq1 can only be a prefix if it's not longer than seq2
+        
         if len(seq1) > len(seq2):
             return False
 
         # Check if seq1 matches the beginning of seq2
         return seq2[: len(seq1)] == seq1
+    
+    
+    def compute_equal_prefix_index(self, seq1, seq2):
+        max_len = min(len(seq1), len(seq2))
+        i = 0
+        while i < max_len and seq1[i] == seq2[i]:
+            i += 1
+            
+        return i
 
     def extract_representations(self, representations, modality, task_name):
         applied_representations = []

From ccfb8c250ba89afab2eb7e509598b7c08b6061c4 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Mon, 18 Aug 2025 12:47:42 +0200
Subject: [PATCH 16/19] add multimodal fusion optimizer

---
 src/main/python/systemds/scuro/__init__.py    |  4 +
 .../systemds/scuro/dataloader/audio_loader.py | 24 +++---
 .../systemds/scuro/dataloader/video_loader.py | 22 ++++--
 .../scuro/drsearch/multimodal_optimizer.py    | 53 +++++++------
 .../scuro/drsearch/unimodal_optimizer.py      |  2 +-
 .../scuro/representations/aggregate.py        |  2 +-
 .../systemds/scuro/representations/average.py | 19 ++---
 .../systemds/scuro/representations/bert.py    |  3 +-
 .../systemds/scuro/representations/bow.py     |  3 +-
 .../scuro/representations/concatenation.py    | 22 +-----
 .../systemds/scuro/representations/fusion.py  | 17 ++++
 .../systemds/scuro/representations/glove.py   |  1 +
 .../scuro/representations/hadamard.py         |  3 +-
 .../systemds/scuro/representations/lstm.py    |  2 +-
 .../systemds/scuro/representations/max.py     |  4 +-
 .../scuro/representations/spectrogram.py      |  2 +-
 .../systemds/scuro/representations/sum.py     | 11 +--
 .../systemds/scuro/representations/tfidf.py   |  1 +
 .../scuro/representations/word2vec.py         |  3 +-
 src/main/python/tests/scuro/data_generator.py | 12 +++
 src/main/python/tests/scuro/test_dr_search.py |  4 +-
 .../python/tests/scuro/test_fusion_orders.py  |  2 +-
 .../scuro/test_fusion_representations.py      | 78 +++++++++++++++++++
 .../tests/scuro/test_multimodal_fusion.py     | 68 +++++++++-------
 .../tests/scuro/test_unimodal_optimizer.py    | 13 ++--
 25 files changed, 248 insertions(+), 127 deletions(-)
 create mode 100644 src/main/python/tests/scuro/test_fusion_representations.py

diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
index 1c3cfe92231..ae9aed44c0a 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -73,6 +73,8 @@
 from systemds.scuro.drsearch.unimodal_representation_optimizer import (
     UnimodalRepresentationOptimizer,
 )
+from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
+from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
 
 
 __all__ = [
@@ -127,4 +129,6 @@
     "OptimizationData",
     "RepresentationCache",
     "UnimodalRepresentationOptimizer",
+    "UnimodalOptimizer",
+    "MultimodalOptimizer",
 ]
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
index a1dad304e53..1197617673f 100644
--- a/src/main/python/systemds/scuro/dataloader/audio_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -45,18 +45,18 @@ def __init__(
 
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
-        # if not self.load_data_from_file:
-        #     import numpy as np
-        #
-        #     self.metadata[file] = self.modality_type.create_audio_metadata(
-        #         1000, np.array([0])
-        #     )
-        # else:
-        audio, sr = librosa.load(file, dtype=self._data_type)
+        if not self.load_data_from_file:
+            import numpy as np
 
-        if self.normalize:
-            audio = librosa.util.normalize(audio)
+            self.metadata[file] = self.modality_type.create_audio_metadata(
+                1000, np.array([0])
+            )
+        else:
+            audio, sr = librosa.load(file, dtype=self._data_type)
 
-        self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio)
+            if self.normalize:
+                audio = librosa.util.normalize(audio)
 
-        self.data.append(audio)
+            self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio)
+
+            self.data.append(audio)
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 96ea5f11f69..0e77d5dc57b 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -35,11 +35,13 @@ def __init__(
         data_type: Union[np.dtype, str] = np.float16,
         chunk_size: Optional[int] = None,
         load=True,
+        fps=None,
     ):
         super().__init__(
             source_path, indices, data_type, chunk_size, ModalityType.VIDEO
         )
         self.load_data_from_file = load
+        self.fps = fps
 
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
@@ -53,25 +55,33 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         if not cap.isOpened():
             raise f"Could not read video at path: {file}"
 
-        fps = cap.get(cv2.CAP_PROP_FPS)
+        orig_fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_interval = 1
+        if self.fps is not None and self.fps < orig_fps:
+            frame_interval = int(round(orig_fps / self.fps))
+        else:
+            self.fps = orig_fps
+
         length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         num_channels = 3
 
         self.metadata[file] = self.modality_type.create_video_metadata(
-            fps, length, width, height, num_channels
+            self.fps, length, width, height, num_channels
         )
 
         frames = []
+        idx = 0
         while cap.isOpened():
             ret, frame = cap.read()
 
             if not ret:
                 break
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            frame = frame.astype(self._data_type) / 255.0
-
-            frames.append(frame)
+            if idx % frame_interval == 0:
+                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                frame = frame.astype(self._data_type) / 255.0
+                frames.append(frame)
+            idx += 1
 
         self.data.append(np.stack(frames))
diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index 0a4dd0c3489..fdda2b6cd20 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -102,10 +102,12 @@ def generate_extensions(current_combo, remaining_indices):
         reuse_fused_representations = False
         for i, modality_combo in enumerate(modality_combos):
             # clear reuse cache
-            
+            if i % 5 == 0:
+                reuse_cache = self.prune_cache(modality_combos[i:], reuse_cache)
+
             if i != 0:
                 reuse_fused_representations = self.is_prefix_match(
-                    modality_combos[i-1], modality_combo
+                    modality_combos[i - 1], modality_combo
                 )
             if reuse_fused_representations:
                 mods = [
@@ -114,9 +116,13 @@ def generate_extensions(current_combo, remaining_indices):
                 ]
                 fused_representations = reuse_cache[modality_combos[i - 1]]
             else:
-                prefix_idx = self.compute_equal_prefix_index(modality_combos[i-1], modality_combo)
+                prefix_idx = self.compute_equal_prefix_index(
+                    modality_combos[i - 1], modality_combo
+                )
                 if prefix_idx > 1:
-                    fused_representations = reuse_cache[modality_combos[i - 1][:prefix_idx]]
+                    fused_representations = reuse_cache[
+                        modality_combos[i - 1][:prefix_idx]
+                    ]
                     reuse_fused_representations = True
                     mods = [
                         self.k_best_cache[task.model.name][mod_idx]
@@ -126,8 +132,7 @@ def generate_extensions(current_combo, remaining_indices):
                 print(
                     f"New modality combo: {modality_combo} - Reuse: {reuse_fused_representations} - # fused reps: {len(fused_representations)}"
                 )
-                
-                
+
             all_mods = [
                 self.k_best_cache[task.model.name][mod_idx]
                 for mod_idx in modality_combo
@@ -163,37 +168,37 @@ def generate_extensions(current_combo, remaining_indices):
                             fusion_method,
                             modality_combo,
                         )
-        
-            if len(modality_combo) < len(self.k_best_cache[task.model.name]) and i +1 < len(modality_combos) and self.is_prefix_match(modality_combos[i], modality_combos[i+1]):
+
+            if (
+                len(modality_combo) < len(self.k_best_cache[task.model.name])
+                and i + 1 < len(modality_combos)
+                and self.is_prefix_match(modality_combos[i], modality_combos[i + 1])
+            ):
                 reuse_cache[modality_combo] = temp_fused_reps
             reuse_fused_representations = False
 
+    def prune_cache(self, sequences, cache):
+        """Keep only cache entries whose key is a prefix of a pending combo."""
+        pending = [tuple(seq) for seq in sequences]
+        return {
+            key: fused
+            for key, fused in cache.items()
+            if any(self.is_prefix_match(key, combo) for combo in pending)
+        }
+
     def is_prefix_match(self, seq1, seq2):
-        """
-        Check if seq1 is a prefix of seq2.
-
-        Args:
-            seq1: First sequence (list)
-            seq2: Second sequence (list)
-
-        Returns:
-            Boolean indicating whether seq1 is a prefix of seq2
-        """
-        # seq1 can only be a prefix if it's not longer than seq2
-        
         if len(seq1) > len(seq2):
             return False
 
         # Check if seq1 matches the beginning of seq2
         return seq2[: len(seq1)] == seq1
-    
-    
+
     def compute_equal_prefix_index(self, seq1, seq2):
         max_len = min(len(seq1), len(seq2))
         i = 0
         while i < max_len and seq1[i] == seq2[i]:
             i += 1
-            
+
         return i
 
     def extract_representations(self, representations, modality, task_name):
@@ -339,7 +344,7 @@ def print_results(self):
                         print(
                             f"    Representation: {entry.modality_combo[i]} - {rep.representations}"
                         )
-                        # if i < len(reps) - 1:
+
                     print(f"    Fusion: {entry.fusion_methods[0]} ")
 
     def store_results(self, file_name=None):
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 1e114bb34ee..030f04aa431 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -29,7 +29,7 @@
 import numpy as np
 from systemds.scuro.representations.window_aggregation import WindowAggregation
 
-from build.lib.systemds.scuro.representations.aggregated_representation import (
+from systemds.scuro.representations.aggregated_representation import (
     AggregatedRepresentation,
 )
 from systemds.scuro import ModalityType, Aggregation
diff --git a/src/main/python/systemds/scuro/representations/aggregate.py b/src/main/python/systemds/scuro/representations/aggregate.py
index 1e73c81696d..2c046dc4016 100644
--- a/src/main/python/systemds/scuro/representations/aggregate.py
+++ b/src/main/python/systemds/scuro/representations/aggregate.py
@@ -92,7 +92,7 @@ def execute(self, modality):
                         padded_data.append(utils.pad_sequences(entry, max_len))
                     data[i] = padded_data
 
-        return data
+        return np.array(data)
 
     def transform(self, modality):
         return self.execute(modality)
diff --git a/src/main/python/systemds/scuro/representations/average.py b/src/main/python/systemds/scuro/representations/average.py
index 8a7e6b9ec8e..ac51f5d1e8d 100644
--- a/src/main/python/systemds/scuro/representations/average.py
+++ b/src/main/python/systemds/scuro/representations/average.py
@@ -18,7 +18,7 @@
 # under the License.
 #
 # -------------------------------------------------------------
-
+import copy
 from typing import List
 
 import numpy as np
@@ -37,23 +37,14 @@ def __init__(self):
         Combines modalities using averaging
         """
         super().__init__("Average")
+        self.needs_alignment = True
         self.associative = True
         self.commutative = True
 
-    def transform(self, modalities: List[Modality]):
-        for modality in modalities:
-            modality.flatten()
-
-        max_emb_size = self.get_max_embedding_size(modalities)
-
-        padded_modalities = []
-        for modality in modalities:
-            d = pad_sequences(modality.data, maxlen=max_emb_size, dtype="float32")
-            padded_modalities.append(d)
-
-        data = padded_modalities[0]
+    def execute(self, modalities: List[Modality]):
+        data = copy.deepcopy(modalities[0].data)
         for i in range(1, len(modalities)):
-            data += padded_modalities[i]
+            data += modalities[i].data
 
         data /= len(modalities)
 
diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py
index e6611f0b7c9..3478b84e672 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -18,7 +18,7 @@
 # under the License.
 #
 # -------------------------------------------------------------
-
+import numpy as np
 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 import torch
@@ -56,6 +56,7 @@ def transform(self, modality):
         if self.output_file is not None:
             save_embeddings(embeddings, self.output_file)
 
+        transformed_modality.data_type = np.float32
         transformed_modality.data = embeddings
         return transformed_modality
 
diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py
index 6778811c49c..7cfddbb506f 100644
--- a/src/main/python/systemds/scuro/representations/bow.py
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -18,7 +18,7 @@
 # under the License.
 #
 # -------------------------------------------------------------
-
+import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer
 
 from systemds.scuro.modality.transformed import TransformedModality
@@ -50,5 +50,6 @@ def transform(self, modality):
         if self.output_file is not None:
             save_embeddings(X, self.output_file)
 
+        transformed_modality.data_type = np.float32
         transformed_modality.data = X
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/concatenation.py b/src/main/python/systemds/scuro/representations/concatenation.py
index c7ce33ab5c7..a4d4d53c43e 100644
--- a/src/main/python/systemds/scuro/representations/concatenation.py
+++ b/src/main/python/systemds/scuro/representations/concatenation.py
@@ -20,7 +20,7 @@
 # -------------------------------------------------------------
 
 from typing import List
-
+import copy
 import numpy as np
 
 from systemds.scuro.modality.modality import Modality
@@ -33,14 +33,13 @@
 
 @register_fusion_operator()
 class Concatenation(Fusion):
-    def __init__(self, padding=True):
+    def __init__(self):
         """
         Combines modalities using concatenation
         """
         super().__init__("Concatenation")
-        self.padding = padding
 
-    def transform(self, modalities: List[Modality]):
+    def execute(self, modalities: List[Modality]):
         if len(modalities) == 1:
             return np.array(modalities[0].data)
 
@@ -53,19 +52,6 @@ def transform(self, modalities: List[Modality]):
             data = np.zeros((size, 0))
 
         for modality in modalities:
-            if self.padding:
-                data = np.concatenate(
-                    [
-                        data,
-                        pad_sequences(
-                            modality.data,
-                            maxlen=max_emb_size,
-                            dtype=modality.data.dtype,
-                        ),
-                    ],
-                    axis=-1,
-                )
-            else:
-                data = np.concatenate([data, modality.data], axis=-1)
+            data = np.concatenate([data, copy.deepcopy(modality.data)], axis=-1)
 
         return np.array(data)
diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py
index cbbb5606e6d..4b746eee219 100644
--- a/src/main/python/systemds/scuro/representations/fusion.py
+++ b/src/main/python/systemds/scuro/representations/fusion.py
@@ -21,9 +21,11 @@
 from typing import List
 
 import numpy as np
+from systemds.scuro import AggregatedRepresentation, Aggregation
 
 from systemds.scuro.modality.modality import Modality
 from systemds.scuro.representations.representation import Representation
+from systemds.scuro.utils.schema_helpers import get_shape
 
 
 class Fusion(Representation):
@@ -44,6 +46,21 @@ def transform(self, modalities: List[Modality]):
         :param modalities: List of modalities used in the fusion
         :return: fused data
         """
+        mods = []
+        for modality in modalities:
+            agg_modality = None
+            if get_shape(modality.metadata) > 1:
+                agg_operator = AggregatedRepresentation(Aggregation())
+                agg_modality = agg_operator.transform(modality)
+            mods.append(agg_modality if agg_modality else modality)
+
+        if self.needs_alignment:
+            max_len = self.get_max_embedding_size(mods)
+            for modality in mods:
+                modality.pad(max_len=max_len)
+        return self.execute(mods)
+
+    def execute(self, modalities: List[Modality]):
         raise f"Not implemented for Fusion: {self.name}"
 
     def get_max_embedding_size(self, modalities: List[Modality]):
diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py
index d948567f3f5..9076efecfc9 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -67,5 +67,6 @@ def transform(self, modality):
         if self.output_file is not None:
             save_embeddings(np.array(embeddings), self.output_file)
 
+        transformed_modality.data_type = np.float32
         transformed_modality.data = np.array(embeddings)
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/hadamard.py b/src/main/python/systemds/scuro/representations/hadamard.py
index 138003b8741..a777768ff6e 100644
--- a/src/main/python/systemds/scuro/representations/hadamard.py
+++ b/src/main/python/systemds/scuro/representations/hadamard.py
@@ -41,8 +41,7 @@ def __init__(self):
         self.commutative = True
         self.associative = True
 
-    def transform(self, modalities: List[Modality], train_indices=None):
-        # TODO: check for alignment in the metadata
+    def execute(self, modalities: List[Modality], train_indices=None):
         fused_data = np.prod([m.data for m in modalities], axis=0)
 
         return fused_data
diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py
index cbab0f68978..0cfafddefa9 100644
--- a/src/main/python/systemds/scuro/representations/lstm.py
+++ b/src/main/python/systemds/scuro/representations/lstm.py
@@ -64,7 +64,7 @@ def transform(self, modalities: List[Modality]):
         result = np.zeros((size, 0))
 
         for modality in modalities:
-            if modality.modality_type in self.unimodal_embeddings.keys():
+            if modality.modality_type in list(self.unimodal_embeddings.keys()):
                 out = self.unimodal_embeddings.get(modality.modality_type)
             else:
                 out = self.run_lstm(modality.data)
diff --git a/src/main/python/systemds/scuro/representations/max.py b/src/main/python/systemds/scuro/representations/max.py
index 6ecf5fd52f3..39f5069c2b5 100644
--- a/src/main/python/systemds/scuro/representations/max.py
+++ b/src/main/python/systemds/scuro/representations/max.py
@@ -40,11 +40,9 @@ def __init__(self):
         self.associative = True
         self.commutative = True
 
-    def transform(
+    def execute(
         self,
         modalities: List[Modality],
     ):
-        # TODO: need to check if data is aligned - same number of dimension
         fused_data = np.maximum.reduce([m.data for m in modalities])
-
         return fused_data
diff --git a/src/main/python/systemds/scuro/representations/spectrogram.py b/src/main/python/systemds/scuro/representations/spectrogram.py
index 5fb1780536d..8daa9abb015 100644
--- a/src/main/python/systemds/scuro/representations/spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/spectrogram.py
@@ -48,7 +48,7 @@ def transform(self, modality):
             ).astype(modality.data_type)
             S_dB = librosa.amplitude_to_db(np.abs(spectrogram))
 
-            result.append(S_dB.T)
+            result.append(S_dB.T.reshape(-1))
 
         transformed_modality.data = result
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/sum.py b/src/main/python/systemds/scuro/representations/sum.py
index 46d93f2eda0..5b3710b6e14 100644
--- a/src/main/python/systemds/scuro/representations/sum.py
+++ b/src/main/python/systemds/scuro/representations/sum.py
@@ -37,15 +37,12 @@ def __init__(self):
         Combines modalities using colum-wise sum
         """
         super().__init__("Sum")
+        self.needs_alignment = True
 
-    def transform(self, modalities: List[Modality]):
-        max_emb_size = self.get_max_embedding_size(modalities)
-
-        data = pad_sequences(modalities[0].data, maxlen=max_emb_size, dtype="float32")
+    def execute(self, modalities: List[Modality]):
+        data = modalities[0].data.copy()  # copy: "+=" must not alter the input
 
         for m in range(1, len(modalities)):
-            data += pad_sequences(
-                modalities[m].data, maxlen=max_emb_size, dtype="float32"
-            )
+            data += modalities[m].data
 
         return data
diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py
index c26039f46a9..3b8f069df83 100644
--- a/src/main/python/systemds/scuro/representations/tfidf.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -47,5 +47,6 @@ def transform(self, modality):
         if self.output_file is not None:
             save_embeddings(X, self.output_file)
 
+        transformed_modality.data_type = np.float32
         transformed_modality.data = X
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index 8543379bc14..88d60ac828b 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -69,5 +69,6 @@ def transform(self, modality):
 
         if self.output_file is not None:
             save_embeddings(np.array(embeddings), self.output_file)
-        transformed_modality.data = embeddings
+        transformed_modality.data_type = np.float32
+        transformed_modality.data = np.array(embeddings)
         return transformed_modality
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index e2dceec329d..e57716fa99d 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -193,6 +193,18 @@ def create_visual_modality(
 
         return (data, metadata)
 
+    def create_balanced_labels(self, num_instances, num_classes=2):
+        """Return a shuffled label vector with equally many instances per class."""
+        if num_instances % num_classes != 0:
+            raise ValueError(
+                "num_instances must be divisible by num_classes to balance classes."
+            )
+
+        # One label per class (0..num_classes-1), each repeated equally often.
+        vector = np.repeat(np.arange(num_classes), num_instances // num_classes)
+        np.random.shuffle(vector)
+        return vector
+
 
 def setup_data(modalities, num_instances, path):
     if os.path.isdir(path):
diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py
index 50f57eebb20..3e0e702e6f3 100644
--- a/src/main/python/tests/scuro/test_dr_search.py
+++ b/src/main/python/tests/scuro/test_dr_search.py
@@ -94,7 +94,9 @@ def setUpClass(cls):
         cls.num_instances = 20
         cls.data_generator = ModalityRandomDataGenerator()
 
-        cls.labels = np.random.choice([0, 1], size=cls.num_instances)
+        cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
+            num_instances=cls.num_instances
+        )
         # TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead
 
         cls.video = cls.data_generator.create1DModality(
diff --git a/src/main/python/tests/scuro/test_fusion_orders.py b/src/main/python/tests/scuro/test_fusion_orders.py
index eb01d18ffe4..22d64bcc0bf 100644
--- a/src/main/python/tests/scuro/test_fusion_orders.py
+++ b/src/main/python/tests/scuro/test_fusion_orders.py
@@ -65,7 +65,7 @@ def test_fusion_order_concat(self):
 
         self.assertFalse(np.array_equal(r_1_r_2.data, r_2_r_1.data))
         self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data))
-        self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data))
+        self.assertFalse(np.array_equal(r_2_r_1.data, r1_r2_r3.data))
         self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data))
 
     def test_fusion_order_max(self):
diff --git a/src/main/python/tests/scuro/test_fusion_representations.py b/src/main/python/tests/scuro/test_fusion_representations.py
new file mode 100644
index 00000000000..6aaeb2a4fbe
--- /dev/null
+++ b/src/main/python/tests/scuro/test_fusion_representations.py
@@ -0,0 +1,78 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import unittest
+
+import numpy as np
+from systemds.scuro.representations.window_aggregation import WindowAggregation
+from systemds.scuro.representations.aggregate import Aggregation
+from systemds.scuro.representations.wav2vec import Wav2Vec
+from systemds.scuro.representations.tfidf import TfIdf
+from systemds.scuro.representations.spectrogram import Spectrogram
+from systemds.scuro.representations.bow import BoW
+from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
+from systemds.scuro.representations.bert import Bert
+from systemds.scuro.representations.mfcc import MFCC
+from systemds.scuro.representations.multimodal_attention_fusion import (
+    MultiModalAttentionFusion,
+    AttentionFusion,
+)
+from systemds.scuro.representations.resnet import ResNet
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.type import ModalityType
+from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader
+
+
+class TestFusionOrders(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.num_instances = 40
+        cls.indices = np.array(range(cls.num_instances))
+        audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
+            cls.num_instances, 100
+        )
+        text_data, text_md = ModalityRandomDataGenerator().create_text_data(
+            cls.num_instances
+        )
+        video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
+            cls.num_instances, 60
+        )
+        cls.audio = UnimodalModality(
+            TestDataLoader(
+                cls.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md
+            )
+        )
+        cls.video = UnimodalModality(
+            TestDataLoader(
+                cls.indices, 10, ModalityType.VIDEO, video_data, np.float32, video_md
+            )
+        )
+        cls.text = UnimodalModality(
+            TestDataLoader(
+                cls.indices, None, ModalityType.TEXT, text_data, str, text_md
+            )
+        )
+
+    def test_attention(self):
+        r_a = self.audio.apply_representation(MelSpectrogram())
+        r_t = self.text.apply_representation(TfIdf())
+        r_v = self.video.apply_representation(ResNet())
+
+        fused = AttentionFusion().transform([r_a, r_v, r_t])
diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py b/src/main/python/tests/scuro/test_multimodal_fusion.py
index 77f03054eb5..ae3ddedffb1 100644
--- a/src/main/python/tests/scuro/test_multimodal_fusion.py
+++ b/src/main/python/tests/scuro/test_multimodal_fusion.py
@@ -22,12 +22,15 @@
 
 import shutil
 import unittest
+from multiprocessing import freeze_support
 
 import numpy as np
 from sklearn import svm
 from sklearn.metrics import classification_report
 from sklearn.model_selection import train_test_split
 
+from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
+from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
 from systemds.scuro.representations.concatenation import Concatenation
 from systemds.scuro.representations.average import Average
 from systemds.scuro.drsearch.fusion_optimizer import FusionOptimizer
@@ -115,7 +118,9 @@ class TestMultimodalRepresentationOptimizer(unittest.TestCase):
     def setUpClass(cls):
         cls.num_instances = 10
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-        cls.labels = np.random.choice([0, 1], size=cls.num_instances)
+        cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
+            num_instances=cls.num_instances
+        )
         cls.indices = np.array(range(cls.num_instances))
 
         split = train_test_split(
@@ -123,31 +128,15 @@ def setUpClass(cls):
             cls.labels,
             test_size=0.2,
             random_state=42,
+            stratify=cls.labels,
         )
         cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
             int(i) for i in split[1]
         ]
 
-        cls.tasks = [
-            Task(
-                "UnimodalRepresentationTask1",
-                TestSVM(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
-            Task(
-                "UnimodalRepresentationTask2",
-                TestCNN(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
-        ]
-
     def test_multimodal_fusion(self):
         task = Task(
-            "UnimodalRepresentationTask1",
+            "MM_Fusion_Task1",
             TestSVM(),
             self.labels,
             self.train_indizes,
@@ -192,22 +181,47 @@ def test_multimodal_fusion(self):
         ):
             registry = Registry()
             registry._fusion_operators = [Average, Concatenation]
-            unimodal_optimizer = UnimodalRepresentationOptimizer(
-                [text, audio, video], [task], max_chain_depth=2
+            unimodal_optimizer = UnimodalOptimizer(
+                [audio, text, video], [task], debug=False
             )
             unimodal_optimizer.optimize()
+            unimodal_optimizer.operator_performance.get_k_best_results(audio, 2, task)
 
-            multimodal_optimizer = FusionOptimizer(
+            multimodal_optimizer = MultimodalOptimizer(
                 [audio, text, video],
-                task,
-                unimodal_optimizer.optimization_results,
-                unimodal_optimizer.cache,
-                2,
-                2,
+                unimodal_optimizer.operator_performance,
+                [task],
                 debug=False,
             )
+
             multimodal_optimizer.optimize()
 
+            assert (
+                len(multimodal_optimizer.optimization_results.results["TestSVM"].keys())
+                == 57
+            )
+            assert (
+                len(
+                    multimodal_optimizer.optimization_results.results["TestSVM"][
+                        "0_1_2_3_4_5"
+                    ]
+                )
+                == 62
+            )
+            assert (
+                len(
+                    multimodal_optimizer.optimization_results.results["TestSVM"][
+                        "3_4_5"
+                    ]
+                )
+                == 6
+            )
+            assert (
+                len(multimodal_optimizer.optimization_results.results["TestSVM"]["0_1"])
+                == 2
+            )
+
 
 if __name__ == "__main__":
+    freeze_support()
     unittest.main()
diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index 192567e92ee..a73d7b5fcc1 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -104,7 +104,9 @@ class TestUnimodalRepresentationOptimizer(unittest.TestCase):
     def setUpClass(cls):
         cls.num_instances = 10
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-        cls.labels = np.random.choice([0, 1], size=cls.num_instances)
+        cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
+            num_instances=cls.num_instances
+        )
         cls.indices = np.array(range(cls.num_instances))
 
         split = train_test_split(
@@ -182,7 +184,7 @@ def optimize_unimodal_representation_for_modality(self, modality):
         ):
             registry = Registry()
 
-            unimodal_optimizer = UnimodalOptimizer([modality], self.tasks)
+            unimodal_optimizer = UnimodalOptimizer([modality], self.tasks, False)
             unimodal_optimizer.optimize()
 
             assert (
@@ -190,10 +192,11 @@ def optimize_unimodal_representation_for_modality(self, modality):
                 == modality.modality_id
             )
             assert len(unimodal_optimizer.operator_performance.task_names) == 2
-            assert (
-                len(unimodal_optimizer.get_k_best_results(modality, 1, self.tasks[0]))
-                == 1
+            result, cached = unimodal_optimizer.operator_performance.get_k_best_results(
+                modality, 1, self.tasks[0]
             )
+            assert len(result) == 1
+            assert len(cached) == 1
 
 
 if __name__ == "__main__":

From e1711b433f279574b6690d3e152d1e5e5762d8e5 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Mon, 18 Aug 2025 12:55:25 +0200
Subject: [PATCH 17/19] remove test

---
 .../scuro/test_fusion_representations.py      | 78 -------------------
 1 file changed, 78 deletions(-)
 delete mode 100644 src/main/python/tests/scuro/test_fusion_representations.py

diff --git a/src/main/python/tests/scuro/test_fusion_representations.py b/src/main/python/tests/scuro/test_fusion_representations.py
deleted file mode 100644
index 6aaeb2a4fbe..00000000000
--- a/src/main/python/tests/scuro/test_fusion_representations.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import unittest
-
-import numpy as np
-from systemds.scuro.representations.window_aggregation import WindowAggregation
-from systemds.scuro.representations.aggregate import Aggregation
-from systemds.scuro.representations.wav2vec import Wav2Vec
-from systemds.scuro.representations.tfidf import TfIdf
-from systemds.scuro.representations.spectrogram import Spectrogram
-from systemds.scuro.representations.bow import BoW
-from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
-from systemds.scuro.representations.bert import Bert
-from systemds.scuro.representations.mfcc import MFCC
-from systemds.scuro.representations.multimodal_attention_fusion import (
-    MultiModalAttentionFusion,
-    AttentionFusion,
-)
-from systemds.scuro.representations.resnet import ResNet
-from systemds.scuro.modality.unimodal_modality import UnimodalModality
-from systemds.scuro.modality.type import ModalityType
-from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader
-
-
-class TestFusionOrders(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.num_instances = 40
-        cls.indices = np.array(range(cls.num_instances))
-        audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
-            cls.num_instances, 100
-        )
-        text_data, text_md = ModalityRandomDataGenerator().create_text_data(
-            cls.num_instances
-        )
-        video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
-            cls.num_instances, 60
-        )
-        cls.audio = UnimodalModality(
-            TestDataLoader(
-                cls.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md
-            )
-        )
-        cls.video = UnimodalModality(
-            TestDataLoader(
-                cls.indices, 10, ModalityType.VIDEO, video_data, np.float32, video_md
-            )
-        )
-        cls.text = UnimodalModality(
-            TestDataLoader(
-                cls.indices, None, ModalityType.TEXT, text_data, str, text_md
-            )
-        )
-
-    def test_attention(self):
-        r_a = self.audio.apply_representation(MelSpectrogram())
-        r_t = self.text.apply_representation(TfIdf())
-        r_v = self.video.apply_representation(ResNet())
-
-        fused = AttentionFusion().transform([r_a, r_v, r_t])

From 02ceb15cf2782ed8db1bca92d9e2ec9f3b4e37ab Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Mon, 18 Aug 2025 12:59:50 +0200
Subject: [PATCH 18/19] add missing header

---
 .../systemds/scuro/utils/static_variables.py  | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/main/python/systemds/scuro/utils/static_variables.py b/src/main/python/systemds/scuro/utils/static_variables.py
index b1b3e657a24..8237cdf1b3e 100644
--- a/src/main/python/systemds/scuro/utils/static_variables.py
+++ b/src/main/python/systemds/scuro/utils/static_variables.py
@@ -1,3 +1,23 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
 import numpy as np
 import torch
 

From a6f21e0aff51b5e0730882d764a02db0c51986e7 Mon Sep 17 00:00:00 2001
From: Christina Dionysio <dionysio@tu-berlin.de>
Date: Mon, 18 Aug 2025 13:22:00 +0200
Subject: [PATCH 19/19] add missing header

---
 .../scuro/drsearch/multimodal_optimizer.py    | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index fdda2b6cd20..2da8e7ae195 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -1,3 +1,23 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
 import itertools
 
 from systemds.scuro.representations.aggregated_representation import (