diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
index ae9aed44c0a..b2a5e9df377 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -38,6 +38,9 @@
 from systemds.scuro.representations.lstm import LSTM
 from systemds.scuro.representations.max import RowMax
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
+from systemds.scuro.representations.multimodal_attention_fusion import (
+    AttentionFusion,
+)
 from systemds.scuro.representations.mfcc import MFCC
 from systemds.scuro.representations.hadamard import Hadamard
 from systemds.scuro.representations.optical_flow import OpticalFlow
@@ -73,6 +76,12 @@
 from systemds.scuro.drsearch.unimodal_representation_optimizer import (
     UnimodalRepresentationOptimizer,
 )
+from systemds.scuro.representations.covarep_audio_features import (
+    RMSE,
+    Spectral,
+    ZeroCrossing,
+    Pitch,
+)
 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
@@ -131,4 +140,9 @@
     "UnimodalRepresentationOptimizer",
     "UnimodalOptimizer",
     "MultimodalOptimizer",
+    "ZeroCrossing",
+    "Pitch",
+    "RMSE",
+    "Spectral",
+    "AttentionFusion",
 ]
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
index 1197617673f..d8080c607d0 100644
--- a/src/main/python/systemds/scuro/dataloader/audio_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -19,7 +19,6 @@
 #
 # -------------------------------------------------------------
 from typing import List, Optional, Union
-
 import librosa
 import numpy as np
diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index 2da8e7ae195..ac4365ed5c6 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -98,6 +98,9 @@ def optimize_intramodal_representations(self, task):
             ],
         )

+    # TODO: check if order matters for reused reps - only compute once - check in cache
+    # TODO: parallelize - whenever an item of len 0 comes along give it to a new thread - merge results
+    # TODO: change the algorithm so that one representation is used until there is no more representations to add - saves a lot of memory
     def optimize_intermodal_representations(self, task):
         modality_combos = []
         n = len(self.k_best_cache[task.model.name])
@@ -122,8 +125,7 @@ def generate_extensions(current_combo, remaining_indices):
         reuse_fused_representations = False
         for i, modality_combo in enumerate(modality_combos):
             # clear reuse cache
-            if i % 5 == 0:
-                reuse_cache = self.prune_cache(modality_combos[i:], reuse_cache)
+            reuse_cache = self.prune_cache(modality_combos[i:], reuse_cache)

             if i != 0:
                 reuse_fused_representations = self.is_prefix_match(
diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py b/src/main/python/systemds/scuro/drsearch/operator_registry.py
index 3909b51ff98..699dcad8571 100644
--- a/src/main/python/systemds/scuro/drsearch/operator_registry.py
+++ b/src/main/python/systemds/scuro/drsearch/operator_registry.py
@@ -43,6 +43,12 @@ def __new__(cls):
                 cls._representations[m_type] = []
         return cls._instance

+    def set_fusion_operators(self, fusion_operators):
+        if isinstance(fusion_operators, list):
+            self._fusion_operators = fusion_operators
+        else:
+            self._fusion_operators = [fusion_operators]
+
     def add_representation(
         self, representation: Representation, modality: ModalityType
     ):
@@ -57,6 +63,13 @@ def add_fusion_operator(self, fusion_operator):
     def get_representations(self, modality: ModalityType):
         return self._representations[modality]

+    def get_not_self_contained_representations(self, modality: ModalityType):
+        reps = []
+        for rep in self.get_representations(modality):
+            if not rep().self_contained:
+                reps.append(rep)
+        return reps
+
     def get_context_operators(self):
         # TODO: return modality specific context operations
         return self._context_operators
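A minimal usage sketch for the new registry hook (assuming the audio features added later in this patch, which register with self_contained=False):

    from systemds.scuro.drsearch.operator_registry import Registry
    from systemds.scuro.modality.type import ModalityType

    registry = Registry()
    # RMSE, Pitch, ZeroCrossing and Spectral register as self_contained=False,
    # which marks them as candidates for being combined with one another.
    combinable = registry.get_not_self_contained_representations(ModalityType.AUDIO)
    print([rep().name for rep in combinable])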
diff --git a/src/main/python/systemds/scuro/drsearch/task.py b/src/main/python/systemds/scuro/drsearch/task.py
index 7e05a489e44..d08844c7bb6 100644
--- a/src/main/python/systemds/scuro/drsearch/task.py
+++ b/src/main/python/systemds/scuro/drsearch/task.py
@@ -19,8 +19,10 @@
 #
 # -------------------------------------------------------------
 import time
-from typing import List
+from typing import List, Union

+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.representations.representation import Representation
 from systemds.scuro.models.model import Model
 import numpy as np
 from sklearn.model_selection import KFold
@@ -57,6 +59,8 @@ def __init__(
         self.inference_time = []
         self.training_time = []
         self.expected_dim = 1
+        self.train_scores = []
+        self.val_scores = []

     def get_train_test_split(self, data):
         X_train = [data[i] for i in self.train_indices]
@@ -73,28 +77,69 @@ def run(self, data):
         :param data: The aligned data used in the prediction process
         :return: the validation accuracy
         """
+        self._reset_params()
+        skf = KFold(n_splits=self.kfold, shuffle=True, random_state=11)
+
+        fold = 0
+        X, y, _, _ = self.get_train_test_split(data)
+
+        for train, test in skf.split(X, y):
+            train_X = np.array(X)[train]
+            train_y = np.array(y)[train]
+            test_X = np.array(X)[test]
+            test_y = np.array(y)[test]
+            self._run_fold(train_X, train_y, test_X, test_y)
+            fold += 1
+
+        if self.measure_performance:
+            self.inference_time = np.mean(self.inference_time)
+            self.training_time = np.mean(self.training_time)
+
+        return [np.mean(self.train_scores), np.mean(self.val_scores)]
+
+    def _reset_params(self):
         self.inference_time = []
         self.training_time = []
+        self.train_scores = []
+        self.val_scores = []
+
+    def _run_fold(self, train_X, train_y, test_X, test_y):
+        train_start = time.time()
+        train_score = self.model.fit(train_X, train_y, test_X, test_y)
+        train_end = time.time()
+        self.training_time.append(train_end - train_start)
+        self.train_scores.append(train_score)
+        test_start = time.time()
+        test_score = self.model.test(np.array(test_X), test_y)
+        test_end = time.time()
+        self.inference_time.append(test_end - test_start)
+        self.val_scores.append(test_score)
+
+    def create_representation_and_run(
+        self,
+        representation: Representation,
+        modalities: Union[List[Modality], Modality],
+    ):
+        self._reset_params()
         skf = KFold(n_splits=self.kfold, shuffle=True, random_state=11)
-        train_scores = []
-        test_scores = []
+
         fold = 0
-        X, y, X_test, y_test = self.get_train_test_split(data)
+        X, y, _, _ = self.get_train_test_split(data)

         for train, test in skf.split(X, y):
             train_X = np.array(X)[train]
             train_y = np.array(y)[train]
-            train_start = time.time()
-            train_score = self.model.fit(train_X, train_y, X_test, y_test)
-            train_end = time.time()
-            self.training_time.append(train_end - train_start)
-            train_scores.append(train_score)
-            test_start = time.time()
-            test_score = self.model.test(np.array(X_test), y_test)
-            test_end = time.time()
-            self.inference_time.append(test_end - test_start)
-            test_scores.append(test_score)
+            test_X = np.array(X)[test]
+            test_y = np.array(y)[test]
+
+            if isinstance(modalities, Modality):
+                rep = modalities.apply_representation(representation())
+            else:
+                representation().transform(
+                    train_X, train_y
+                )  # TODO: think about a way how to handle masks
+            self._run_fold(train_X, train_y, test_X, test_y)
             fold += 1

         if self.measure_performance:
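A short sketch of the refactored Task flow (assumes a configured task and pre-aligned data, as elsewhere in this patch): run() resets the per-fold state, delegates each KFold split to _run_fold, and returns fold-averaged scores.

    train_score, val_score = task.run(data)
    # task.training_time and task.inference_time collect per-fold timings and are
    # collapsed to their means when task.measure_performance is set.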
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 030f04aa431..b84d86d94dd 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -20,6 +20,7 @@
 # -------------------------------------------------------------
 import pickle
 import time
+import copy
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from dataclasses import dataclass, field, asdict

@@ -28,11 +29,17 @@
 import numpy as np

 from systemds.scuro.representations.window_aggregation import WindowAggregation
+from systemds.scuro.representations.concatenation import Concatenation
+from systemds.scuro.representations.hadamard import Hadamard
+from systemds.scuro.representations.sum import Sum
 from systemds.scuro.representations.aggregated_representation import (
     AggregatedRepresentation,
 )

-from systemds.scuro import ModalityType, Aggregation
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.representations.aggregate import Aggregation
 from systemds.scuro.drsearch.operator_registry import Registry
 from systemds.scuro.utils.schema_helpers import get_shape
@@ -84,7 +91,6 @@ def optimize_parallel(self, n_workers=None):
     def optimize(self):
         for modality in self.modalities:
             local_result = self._process_modality(modality, False)
-            # self._merge_results(local_result)

     def _process_modality(self, modality, parallel):
         if parallel:
@@ -95,43 +101,59 @@ def _process_modality(self, modality, parallel):
             local_results = self.operator_performance

         context_operators = self.operator_registry.get_context_operators()
-
-        for context_operator in context_operators:
-            context_representation = None
-            if (
-                modality.modality_type != ModalityType.TEXT
-                and modality.modality_type != ModalityType.VIDEO
-            ):
-                con_op = context_operator()
-                context_representation = modality.context(con_op)
-                self._evaluate_local(context_representation, [con_op], local_results)
-
-            modality_specific_operators = self.operator_registry.get_representations(
+        not_self_contained_reps = (
+            self.operator_registry.get_not_self_contained_representations(
                 modality.modality_type
             )
-            for modality_specific_operator in modality_specific_operators:
-                mod_context = None
-                mod_op = modality_specific_operator()
-                if context_representation is not None:
-                    mod_context = context_representation.apply_representation(mod_op)
-                    self._evaluate_local(mod_context, [con_op, mod_op], local_results)
-
-                mod = modality.apply_representation(mod_op)
-                self._evaluate_local(mod, [mod_op], local_results)
-
-                for context_operator_after in context_operators:
-                    con_op_after = context_operator_after()
-                    if mod_context is not None:
-                        mod_context = mod_context.context(con_op_after)
-                        self._evaluate_local(
-                            mod_context, [con_op, mod_op, con_op_after], local_results
-                        )
-
-                    mod = mod.context(con_op_after)
-                    self._evaluate_local(mod, [mod_op, con_op_after], local_results)
+        )
+        modality_specific_operators = self.operator_registry.get_representations(
+            modality.modality_type
+        )
+        for modality_specific_operator in modality_specific_operators:
+            mod_op = modality_specific_operator()
+
+            mod = modality.apply_representation(mod_op)
+            self._evaluate_local(mod, [mod_op], local_results)
+
+            if not mod_op.self_contained:
+                self._combine_non_self_contained_representations(
+                    modality, mod, not_self_contained_reps, local_results
+                )
+
+            for context_operator_after in context_operators:
+                con_op_after = context_operator_after()
+                mod = mod.context(con_op_after)
+                self._evaluate_local(mod, [mod_op, con_op_after], local_results)

         return local_results

+    def _combine_non_self_contained_representations(
+        self,
+        modality: Modality,
+        representation: TransformedModality,
+        other_representations,
+        local_results,
+    ):
+        combined = representation
+        context_operators = self.operator_registry.get_context_operators()
+        used_representations = representation.transformation
+        for other_representation in other_representations:
+            used_representations.append(other_representation())
+            for combination in [Concatenation(), Hadamard(), Sum()]:
+                combined = combined.combine(
+                    modality.apply_representation(other_representation()), combination
+                )
+                self._evaluate_local(
+                    combined, used_representations, local_results, combination
+                )
+
+                for context_op in context_operators:
+                    con_op = context_op()
+                    mod = combined.context(con_op)
+                    c_t = copy.deepcopy(used_representations)
+                    c_t.append(con_op)
+                    self._evaluate_local(mod, c_t, local_results, combination)
+
     def _merge_results(self, local_results):
         """Merge local results into the main results"""
         for modality_id in local_results.results:
@@ -145,7 +167,9 @@ def _merge_results(self, local_results):
                 for key, value in local_results.cache[modality][task_name].items():
                     self.operator_performance.cache[modality][task_name][key] = value

-    def _evaluate_local(self, modality, representations, local_results):
+    def _evaluate_local(
+        self, modality, representations, local_results, combination=None
+    ):
         if self._tasks_require_same_dims:
             if self.expected_dimensions == 1 and get_shape(modality.metadata) > 1:
                 # for aggregation in Aggregation().get_aggregation_functions():
@@ -165,6 +189,7 @@ def _evaluate_local(
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
             else:
                 modality.pad()
@@ -178,6 +203,7 @@ def _evaluate_local(
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
         else:
             for task in self.tasks:
@@ -198,6 +224,7 @@ def _evaluate_local(
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
                 else:
                     # modality.pad()
@@ -210,6 +237,7 @@ def _evaluate_local(
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
@@ -228,7 +256,9 @@ def __init__(self, modalities, tasks, debug=False):
                 self.cache[modality][task_name] = {}
                 self.results[modality][task_name] = []

-    def add_result(self, scores, representations, modality, task_name, task_time):
+    def add_result(
+        self, scores, representations, modality, task_name, task_time, combination
+    ):
         parameters = []
         representation_names = []
@@ -256,6 +286,7 @@ def add_result(
             val_score=scores[1],
             representation_time=modality.transform_time,
             task_time=task_time,
+            combination=combination.name if combination else "",
         )
         self.results[modality.modality_id][task_name].append(entry)
         self.cache[modality.modality_id][task_name][
@@ -302,3 +333,4 @@ class ResultEntry:
     train_score: float
     representation_time: float
     task_time: float
+    combination: str
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 94e745b2cc1..f1b00fefcfe 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -168,6 +168,6 @@ def is_aligned(self, other_modality):
                 != list(other_modality.metadata.values())[i]["data_layout"]["shape"]
             ):
                 aligned = False
-                continue
+                break

         return aligned
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index 6523e9502fc..9481937e2ca 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -32,7 +32,9 @@


 class TransformedModality(Modality):

-    def __init__(self, modality, transformation, new_modality_type=None):
+    def __init__(
+        self, modality, transformation, new_modality_type=None, self_contained=True
+    ):
         """
         Parent class of the different Modalities (unimodal & multimodal)
         :param modality_type: Type of the original modality(ies)
@@ -46,8 +48,18 @@ def __init__(
             new_modality_type, modality.modality_id, metadata, modality.data_type
         )
         self.transformation = None
+        self.self_contained = (
+            self_contained and transformation.self_contained
+            if hasattr(transformation, "self_contained")
+            else self_contained
+        )
         self.add_transformation(transformation, modality)

+        if modality.__class__.__name__ == "UnimodalModality":
+            for k, v in self.metadata.items():
+                if "attention_masks" in v:
+                    del self.metadata[k]["attention_masks"]
+
     def add_transformation(self, transformation, modality):
         if (
             transformation.__class__.__bases__[0].__name__ == "Fusion"
@@ -89,14 +101,18 @@ def join(self, right, join_condition):

     def window_aggregation(self, windowSize, aggregation):
         w = WindowAggregation(windowSize, aggregation)
-        transformed_modality = TransformedModality(self, w)
+        transformed_modality = TransformedModality(
+            self, w, self_contained=self.self_contained
+        )
         start = time.time()
         transformed_modality.data = w.execute(self)
         transformed_modality.transform_time = time.time() - start
         return transformed_modality

     def context(self, context_operator):
-        transformed_modality = TransformedModality(self, context_operator)
+        transformed_modality = TransformedModality(
+            self, context_operator, self_contained=self.self_contained
+        )
         start = time.time()
         transformed_modality.data = context_operator.execute(self)
         transformed_modality.transform_time = time.time() - start
@@ -107,6 +123,7 @@ def apply_representation(self, representation):
         new_modality = representation.transform(self)
         new_modality.update_metadata()
         new_modality.transform_time = time.time() - start
+        new_modality.self_contained = representation.self_contained
         return new_modality

     def combine(self, other: Union[Modality, List[Modality]], fusion_method):
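How the self_contained flag is intended to propagate, per the constructors above (hedged sketch; Wav2Vec keeping the default True is an assumption):

    mfcc = audio.apply_representation(MFCC())    # MFCC registers self_contained=False
    windowed = mfcc.window_aggregation(10, agg)  # agg: some Aggregation instance; inherits False
    w2v = audio.apply_representation(Wav2Vec())  # default -> self_contained == True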
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index 94d1fa057d9..dd1674ea85a 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -110,6 +110,9 @@ def apply_representation(self, representation):
             self,
             representation,
         )
+
+        pad_dim_one = False
+
         new_modality.data = []
         start = time.time()

         original_lengths = []
@@ -131,26 +134,39 @@ def apply_representation(self, representation):
             "attention_masks" in entry for entry in new_modality.metadata.values()
         ):
             for d in new_modality.data:
-                original_lengths.append(d.shape[0])
+                if d.shape[0] == 1 and d.ndim == 2:
+                    pad_dim_one = True
+                    original_lengths.append(d.shape[1])
+                else:
+                    original_lengths.append(d.shape[0])
+
+        new_modality.data = self.l2_normalize_features(new_modality.data)

         if len(original_lengths) > 0 and min(original_lengths) < max(original_lengths):
             target_length = max(original_lengths)
             padded_embeddings = []
             for embeddings in new_modality.data:
-                current_length = embeddings.shape[0]
+                current_length = (
+                    embeddings.shape[0] if not pad_dim_one else embeddings.shape[1]
+                )
                 if current_length < target_length:
                     padding_needed = target_length - current_length
-
-                    padded = np.pad(
-                        embeddings,
-                        pad_width=(
-                            (0, padding_needed),
-                            (0, 0),
-                        ),
-                        mode="constant",
-                        constant_values=0,
-                    )
-                    padded_embeddings.append(padded)
+                    if pad_dim_one:
+                        padding = np.zeros((embeddings.shape[0], padding_needed))
+                        padded_embeddings.append(
+                            np.concatenate((embeddings, padding), axis=1)
+                        )
+                    else:
+                        padded = np.pad(
+                            embeddings,
+                            pad_width=(
+                                (0, padding_needed),
+                                (0, 0),
+                            ),
+                            mode="constant",
+                            constant_values=0,
+                        )
+                        padded_embeddings.append(padded)
                 else:
                     padded_embeddings.append(embeddings)
@@ -164,4 +180,22 @@ def apply_representation(self, representation):
             new_modality.data = padded_embeddings
         new_modality.update_metadata()
         new_modality.transform_time = time.time() - start
+        new_modality.self_contained = representation.self_contained
         return new_modality
+
+    def l2_normalize_features(self, feature_list):
+        normalized_features = []
+        for feature in feature_list:
+            original_shape = feature.shape
+            flattened = feature.flatten()
+
+            norm = np.linalg.norm(flattened)
+            if norm > 0:
+                normalized_flat = flattened / norm
+                normalized_feature = normalized_flat.reshape(original_shape)
+            else:
+                normalized_feature = feature
+
+            normalized_features.append(normalized_feature)
+
+        return normalized_features
diff --git a/src/main/python/systemds/scuro/representations/aggregated_representation.py b/src/main/python/systemds/scuro/representations/aggregated_representation.py
index 9412c5be008..9119070a027 100644
--- a/src/main/python/systemds/scuro/representations/aggregated_representation.py
+++ b/src/main/python/systemds/scuro/representations/aggregated_representation.py
@@ -26,8 +26,11 @@ class AggregatedRepresentation(Representation):
     def __init__(self, aggregation):
         super().__init__("AggregatedRepresentation", aggregation.parameters)
         self.aggregation = aggregation
+        self.self_contained = True

     def transform(self, modality):
-        aggregated_modality = TransformedModality(modality, self)
+        aggregated_modality = TransformedModality(
+            modality, self, self_contained=modality.self_contained
+        )
         aggregated_modality.data = self.aggregation.execute(modality)
         return aggregated_modality
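In isolation, l2_normalize_features (added to UnimodalModality above) scales each instance's flattened features to unit L2 norm while preserving shape; all-zero tensors pass through unchanged:

    import numpy as np

    feature = np.array([[3.0, 4.0]])
    normalized = feature / np.linalg.norm(feature.flatten())  # [[0.6, 0.8]]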
diff --git a/src/main/python/systemds/scuro/representations/covarep_audio_features.py b/src/main/python/systemds/scuro/representations/covarep_audio_features.py
new file mode 100644
index 00000000000..3b4398cb11b
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/covarep_audio_features.py
@@ -0,0 +1,156 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import librosa
+import numpy as np
+
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.modality.transformed import TransformedModality
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.drsearch.operator_registry import register_representation
+
+
+@register_representation(ModalityType.AUDIO)
+class Spectral(UnimodalRepresentation):
+    def __init__(self, hop_length=512):
+        parameters = {
+            "hop_length": [256, 512, 1024, 2048],
+        }  # TODO
+        super().__init__("Spectral", ModalityType.EMBEDDING, parameters, False)
+        self.hop_length = hop_length
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        result = []
+        for i, y in enumerate(modality.data):
+            sr = list(modality.metadata.values())[i]["frequency"]
+
+            spectral_centroid = librosa.feature.spectral_centroid(
+                y=y, sr=sr, hop_length=self.hop_length
+            )
+            spectral_bandwidth = librosa.feature.spectral_bandwidth(
+                y=y, sr=sr, hop_length=self.hop_length
+            )
+            spectral_rolloff = librosa.feature.spectral_rolloff(
+                y=y, sr=sr, hop_length=self.hop_length
+            )
+            spectral_flatness = librosa.feature.spectral_flatness(
+                y=y, hop_length=self.hop_length
+            )
+
+            features = np.vstack(
+                [
+                    spectral_centroid,
+                    spectral_bandwidth,
+                    spectral_rolloff,
+                    spectral_flatness,
+                ]
+            )
+
+            result.append(features.T)
+
+        transformed_modality.data = result
+
+        return transformed_modality
+
+
+@register_representation(ModalityType.AUDIO)
+class ZeroCrossing(UnimodalRepresentation):
+    def __init__(self, hop_length=512):
+        parameters = {
+            "hop_length": [256, 512, 1024, 2048],
+        }  # TODO
+        super().__init__("ZeroCrossing", ModalityType.EMBEDDING, parameters, False)
+        self.hop_length = hop_length
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        result = []
+        for i, y in enumerate(modality.data):
+            zero_crossing_rate = librosa.feature.zero_crossing_rate(
+                y, hop_length=self.hop_length
+            )
+
+            result.append(zero_crossing_rate)
+
+        transformed_modality.data = result
+
+        return transformed_modality
+
+
+@register_representation(ModalityType.AUDIO)
+class RMSE(UnimodalRepresentation):
+    def __init__(self, frame_length=1024, hop_length=512):
+        parameters = {
+            "frame_length": [1024, 2048, 4096],
+            "hop_length": [256, 512, 1024, 2048],
+        }  # TODO
+        super().__init__("RMSE", ModalityType.EMBEDDING, parameters, False)
+        self.hop_length = hop_length
+        self.frame_length = frame_length
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        result = []
+        for i, y in enumerate(modality.data):
+            rmse = librosa.feature.rms(
+                y=y, frame_length=self.frame_length, hop_length=self.hop_length
+            )
+            result.append(rmse)
+
+        transformed_modality.data = result
+
+        return transformed_modality
+
+
+@register_representation(ModalityType.AUDIO)
+class Pitch(UnimodalRepresentation):
+    def __init__(self, hop_length=512):
+        parameters = {
+            "hop_length": [256, 512, 1024, 2048],
+        }  # TODO
+        super().__init__("Pitch", ModalityType.EMBEDDING, parameters, False)
+        self.hop_length = hop_length
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        result = []
+        for i, y in enumerate(modality.data):
+            sr = list(modality.metadata.values())[i]["frequency"]
+
+            pitches, magnitudes = librosa.piptrack(
+                y=y, sr=sr, hop_length=self.hop_length
+            )
+            pitch = pitches[magnitudes.argmax(axis=0), np.arange(magnitudes.shape[1])]
+
+            result.append(pitch[np.newaxis, :])
+
+        transformed_modality.data = result
+
+        return transformed_modality
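A hedged usage sketch for the new low-level audio features (frame counts assume librosa's default centered framing on 1 s of 16 kHz audio):

    import numpy as np
    import librosa

    y = np.random.randn(16000).astype(np.float32)
    rms = librosa.feature.rms(y=y, frame_length=1024, hop_length=512)  # shape (1, 32)
    zcr = librosa.feature.zero_crossing_rate(y, hop_length=512)        # shape (1, 32)
    # Spectral.transform stacks centroid/bandwidth/rolloff/flatness and transposes,
    # giving (n_frames, 4) per instance; ZeroCrossing and RMSE keep (1, n_frames).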
diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py
index 4b746eee219..61988abba26 100644
--- a/src/main/python/systemds/scuro/representations/fusion.py
+++ b/src/main/python/systemds/scuro/representations/fusion.py
@@ -38,6 +38,8 @@ def __init__(self, name, parameters=None):
         self.associative = False
         self.commutative = False
         self.needs_alignment = False
+        self.needs_training = False
+        self.needs_instance_alignment = False

     def transform(self, modalities: List[Modality]):
         """
@@ -58,8 +60,27 @@ def transform(self, modalities: List[Modality]):
             max_len = self.get_max_embedding_size(mods)
             for modality in mods:
                 modality.pad(max_len=max_len)
+
         return self.execute(mods)

+    def transform_with_training(
+        self, modalities: List[Modality], train_indices, labels
+    ):
+        # if self.needs_instance_alignment:
+        #     max_len = self.get_max_embedding_size(modalities)
+        #     for modality in modalities:
+        #         modality.pad(max_len=max_len)
+
+        self.execute(
+            [np.array(modality.data)[train_indices] for modality in modalities],
+            labels[train_indices],
+        )
+
+    def transform_data(self, modalities: List[Modality], val_indices):
+        return self.apply_representation(
+            [np.array(modality.data)[val_indices] for modality in modalities]
+        )
+
     def execute(self, modalities: List[Modality]):
         raise f"Not implemented for Fusion: {self.name}"
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 8e897542b0c..dca1b0eec85 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -36,7 +36,7 @@ def __init__(self, n_mels=128, hop_length=512, n_fft=2048):
             "hop_length": [256, 512, 1024, 2048],
             "n_fft": [1024, 2048, 4096],
         }
-        super().__init__("MelSpectrogram", ModalityType.TIMESERIES, parameters)
+        super().__init__("MelSpectrogram", ModalityType.TIMESERIES, parameters, False)
         self.n_mels = n_mels
         self.hop_length = hop_length
         self.n_fft = n_fft
@@ -56,9 +56,8 @@ def transform(self, modality):
                 hop_length=self.hop_length,
                 n_fft=self.n_fft,
             ).astype(modality.data_type)
-            S_dB = librosa.power_to_db(S, ref=np.max)

-            result.append(S_dB.T)
+            result.append(S.T)

         transformed_modality.data = result
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py
index 00f735a756e..c942f3076e7 100644
--- a/src/main/python/systemds/scuro/representations/mfcc.py
+++ b/src/main/python/systemds/scuro/representations/mfcc.py
@@ -37,7 +37,7 @@ def __init__(self, n_mfcc=12, dct_type=2, n_mels=128, hop_length=512):
             "hop_length": [256, 512, 1024, 2048],
             "n_mels": [20, 32, 64, 128],
         }  # TODO
-        super().__init__("MFCC", ModalityType.TIMESERIES, parameters)
+        super().__init__("MFCC", ModalityType.TIMESERIES, parameters, False)
         self.n_mfcc = n_mfcc
         self.dct_type = dct_type
         self.n_mels = n_mels
diff --git a/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py b/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py
new file mode 100644
index 00000000000..7928b9988bd
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py
@@ -0,0 +1,365 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import List, Dict, Optional
+import numpy as np
+from systemds.scuro.drsearch.operator_registry import register_fusion_operator
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.representations.fusion import Fusion
+from systemds.scuro.utils.static_variables import get_device
+
+
+@register_fusion_operator()
+class AttentionFusion(Fusion):
+    def __init__(
+        self,
+        hidden_dim=256,
+        num_heads=8,
+        dropout=0.1,
+        fusion_strategy="attention",
+        batch_size=32,
+        num_epochs=50,
+    ):
+        self.encoder = None
+        params = {
+            "hidden_dim": [128, 256, 512],
+            "num_heads": [1, 4, 8],
+            "dropout": [0.1, 0.2, 0.3],
+            "fusion_strategy": ["mean", "max", "attention", "cls"],
+            "batch_size": [32, 64, 128],
+            "num_epochs": [50, 70, 100, 150],
+        }
+        super().__init__("AttentionFusion", params)
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.fusion_strategy = fusion_strategy
+        self.batch_size = batch_size
+        self.needs_training = True
+        self.needs_instance_alignment = True
+        self.num_epochs = num_epochs
+
+    def execute(
+        self,
+        data: List[np.ndarray],
+        labels: np.ndarray,
+    ):
+        input_dimension = {}
+        inputs = {}
+        max_sequence_length = 0
+        masks = {}
+        for i, modality in enumerate(data):
+            modality_name = "modality_" + str(i)
+            shape = modality.shape
+            max_sequence_length = max(max_sequence_length, shape[1])
+            input_dimension[modality_name] = shape[2] if len(shape) > 2 else shape[1]
+            inputs[modality_name] = torch.from_numpy(np.stack(modality)).to(
+                get_device()
+            )
+
+            # attention_masks_list = [
+            #     entry["attention_masks"]
+            #     for entry in modality.metadata.values()
+            #     if "attention_masks" in entry
+            # ]
+            attention_masks_list = None
+            if attention_masks_list:
+                masks[modality_name] = (
+                    torch.tensor(np.array(attention_masks_list)).bool().to(get_device())
+                )
+            else:
+                masks[modality_name] = None
+
+        self.encoder = MultiModalAttentionFusion(
+            input_dimension,
+            self.hidden_dim,
+            self.num_heads,
+            self.dropout,
+            max_sequence_length,
+            self.fusion_strategy,
+        )
+
+        head = FusedClassificationHead(
+            fused_dim=self.hidden_dim, num_classes=len(np.unique(labels))
+        )
+        criterion = nn.CrossEntropyLoss()
+        optimizer = torch.optim.Adam(
+            list(self.encoder.parameters()) + list(head.parameters()), lr=0.001
+        )
+        labels = torch.from_numpy(labels).to(get_device())
+
+        for epoch in range(self.num_epochs):
+            total_loss = 0
+            total_accuracy = 0
+            for batch_idx in range(0, len(labels), self.batch_size):
+                batched_input = {}
+                for modality, modality_data in inputs.items():
+                    batched_input[modality] = modality_data[
+                        batch_idx : batch_idx + self.batch_size
+                    ]
+                loss, predictions = self.train_encoder_step(
+                    head,
+                    batched_input,
+                    labels[batch_idx : batch_idx + self.batch_size],
+                    criterion,
+                    optimizer,
+                )
+                total_loss += loss
+                total_accuracy += predictions
+
+            if epoch % 50 == 0 or epoch == self.num_epochs - 1:
+                print(
+                    f"Epoch {epoch}, Loss: {total_loss:.4f}, accuracy: {total_accuracy/len(labels):.4f}"
+                )
+
+    # Training step (encoder + classification head)
+    def train_encoder_step(self, head, inputs, labels, criterion, optimizer):
+        self.encoder.train()
+        head.train()
+        optimizer.zero_grad()
+        output = self.encoder(inputs)
+        logits = head(output["fused"])
+        loss = criterion(logits, labels)
+        loss.backward()
+        optimizer.step()
+        _, predicted = torch.max(logits.data, 1)
+        return loss.item(), (predicted == labels).sum().item()
+
+    def apply_representation(self, modalities):
+        inputs = {}
+        for i, modality in enumerate(modalities):
+            modality_name = "modality_" + str(i)
+            inputs[modality_name] = torch.from_numpy(np.stack(modality)).to(
+                get_device()
+            )
+        self.encoder.eval()
+        with torch.no_grad():
+            output = self.encoder(inputs)
+        return output["fused"].cpu().numpy()
+
+
+class FusedClassificationHead(nn.Module):
+    """
+    Simple classification head for supervision during training.
+    """
+
+    def __init__(self, fused_dim, num_classes=2):
+        super(FusedClassificationHead, self).__init__()
+        self.head = nn.Sequential(
+            nn.Linear(fused_dim, fused_dim // 2),
+            nn.ReLU(),
+            nn.Linear(fused_dim // 2, num_classes),
+        ).to(get_device())
+
+    def forward(self, fused):
+        return self.head(fused)
+
+
+class MultiModalAttentionFusion(nn.Module):
+    def __init__(
+        self,
+        modality_dims: Dict[str, int],
+        hidden_dim: int,
+        num_heads: int,
+        dropout: float,
+        max_seq_len: int,
+        pooling_strategy: str,
+    ):
+        super().__init__()
+
+        self.modality_dims = modality_dims
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.pooling_strategy = pooling_strategy
+        self.max_seq_len = max_seq_len
+
+        # Project each modality to the same hidden dimension
+        self.modality_projections = nn.ModuleDict(
+            {
+                modality: nn.Linear(dim, hidden_dim).to(get_device())
+                for modality, dim in modality_dims.items()
+            }
+        )
+
+        # Positional encoding for sequence modalities
+        self.positional_encoding = nn.Parameter(
+            torch.randn(max_seq_len, hidden_dim) * 0.1
+        ).to(get_device())
+
+        # Cross-modal attention
+        self.cross_attention = nn.MultiheadAttention(
+            embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout, batch_first=True
+        ).to(get_device())
+
+        # Self-attention within each modality
+        self.self_attention = nn.MultiheadAttention(
+            embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout, batch_first=True
+        ).to(get_device())
+
+        # Attention-based pooling for sequences
+        if pooling_strategy == "attention":
+            self.pooling_attention = nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim // 2),
+                nn.Tanh(),
+                nn.Linear(hidden_dim // 2, 1),
+            ).to(get_device())
+
+        # Modality-level attention for final fusion
+        self.modality_attention = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim // 2),
+            nn.ReLU(),
+            nn.Linear(hidden_dim // 2, 1),
+        ).to(get_device())
+
+        # Layer normalization
+        self.layer_norm = nn.LayerNorm(hidden_dim).to(get_device())
+        self.dropout = nn.Dropout(dropout).to(get_device())
+
+        # Final projection
+        self.final_projection = nn.Linear(hidden_dim, hidden_dim).to(get_device())
+ """ + + def __init__(self, fused_dim, num_classes=2): + super(FusedClassificationHead, self).__init__() + self.head = nn.Sequential( + nn.Linear(fused_dim, fused_dim // 2), + nn.ReLU(), + nn.Linear(fused_dim // 2, num_classes), + ).to(get_device()) + + def forward(self, fused): + return self.head(fused) + + +class MultiModalAttentionFusion(nn.Module): + def __init__( + self, + modality_dims: Dict[str, int], + hidden_dim: int, + num_heads: int, + dropout: float, + max_seq_len: int, + pooling_strategy: str, + ): + super().__init__() + + self.modality_dims = modality_dims + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.pooling_strategy = pooling_strategy + self.max_seq_len = max_seq_len + + # Project each modality to the same hidden dimension + self.modality_projections = nn.ModuleDict( + { + modality: nn.Linear(dim, hidden_dim).to(get_device()) + for modality, dim in modality_dims.items() + } + ) + + # Positional encoding for sequence modalities + self.positional_encoding = nn.Parameter( + torch.randn(max_seq_len, hidden_dim) * 0.1 + ).to(get_device()) + + # Cross-modal attention + self.cross_attention = nn.MultiheadAttention( + embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout, batch_first=True + ).to(get_device()) + + # Self-attention within each modality + self.self_attention = nn.MultiheadAttention( + embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout, batch_first=True + ).to(get_device()) + + # Attention-based pooling for sequences + if pooling_strategy == "attention": + self.pooling_attention = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.Tanh(), + nn.Linear(hidden_dim // 2, 1), + ).to(get_device()) + + # Modality-level attention for final fusion + self.modality_attention = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), + nn.Linear(hidden_dim // 2, 1), + ).to(get_device()) + + # Layer normalization + self.layer_norm = nn.LayerNorm(hidden_dim).to(get_device()) + self.dropout = nn.Dropout(dropout).to(get_device()) + + # Final projection + self.final_projection = nn.Linear(hidden_dim, hidden_dim).to(get_device()) + + def _handle_input_format(self, modality_tensor: torch.Tensor) -> torch.Tensor: + if len(modality_tensor.shape) == 2: + modality_tensor = modality_tensor.unsqueeze(1) + elif len(modality_tensor.shape) == 3: + pass + else: + raise ValueError( + f"Input tensor must be 2D or 3D, got {len(modality_tensor.shape)}D" + ) + + if modality_tensor.dtype != torch.float: + modality_tensor = modality_tensor.float() + + return modality_tensor + + def _pool_sequence( + self, sequence: torch.Tensor, mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + if self.pooling_strategy == "mean": + if mask is not None: + # Masked mean pooling + masked_seq = sequence * mask.unsqueeze(-1) + return masked_seq.sum(dim=1) / mask.sum(dim=1, keepdim=True).clamp( + min=1 + ) + else: + return sequence.mean(dim=1) + + elif self.pooling_strategy == "max": + if mask is not None: + # Set masked positions to large negative value before max pooling + masked_seq = sequence.masked_fill(~mask.unsqueeze(-1), float("-inf")) + return masked_seq.max(dim=1)[0] + else: + return sequence.max(dim=1)[0] + + elif self.pooling_strategy == "cls": + # Use the first token (assuming it's a CLS token) + return sequence[:, 0, :] + + elif self.pooling_strategy == "attention": + # Attention-based pooling + attention_scores = self.pooling_attention(sequence).squeeze( + -1 + ) # (batch, seq) + + if mask is not None: + attention_scores = 
+
+            if mask is not None:
+                attention_scores = attention_scores.masked_fill(~mask, float("-inf"))
+
+            attention_weights = F.softmax(attention_scores, dim=1)  # (batch, seq)
+            return (sequence * attention_weights.unsqueeze(-1)).sum(
+                dim=1
+            )  # (batch, hidden)
+
+        else:
+            raise ValueError(f"Unknown pooling strategy: {self.pooling_strategy}")
+
+    def forward(
+        self,
+        modality_inputs: Dict[str, torch.Tensor],
+        modality_masks: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> Dict[str, torch.Tensor]:
+        modality_embeddings = {}
+
+        for modality, input_tensor in modality_inputs.items():
+            normalized_input = self._handle_input_format(input_tensor)
+            seq_len = normalized_input.size(1)
+
+            projected = self.modality_projections[modality](normalized_input)
+
+            if seq_len > 1:
+                pos_encoding = self.positional_encoding[:seq_len].unsqueeze(0)
+                projected = projected + pos_encoding
+
+            if seq_len > 1:
+                mask = modality_masks.get(modality) if modality_masks else None
+
+                attended, _ = self.self_attention(
+                    query=projected,
+                    key=projected,
+                    value=projected,
+                    key_padding_mask=~mask if mask is not None else None,
+                )
+
+                projected = self.layer_norm(projected + self.dropout(attended))
+
+                pooled = self._pool_sequence(projected, mask)
+            else:
+                pooled = projected.squeeze(1)
+
+            modality_embeddings[modality] = pooled
+
+        if len(modality_embeddings) > 1:
+            modality_stack = torch.stack(list(modality_embeddings.values()), dim=1)
+
+            cross_attended, cross_attention_weights = self.cross_attention(
+                query=modality_stack, key=modality_stack, value=modality_stack
+            )
+
+            cross_attended = self.layer_norm(
+                modality_stack + self.dropout(cross_attended)
+            )
+
+            updated_embeddings = {
+                modality: cross_attended[:, i, :]
+                for i, modality in enumerate(modality_embeddings.keys())
+            }
+            modality_embeddings = updated_embeddings
+
+        modality_stack = torch.stack(list(modality_embeddings.values()), dim=1)
+        modality_scores = self.modality_attention(modality_stack).squeeze(-1)
+        modality_weights = F.softmax(modality_scores, dim=1)
+
+        fused_representation = (modality_stack * modality_weights.unsqueeze(-1)).sum(
+            dim=1
+        )
+
+        output = self.final_projection(fused_representation)
+
+        return {
+            "fused": output,
+            "modality_embeddings": modality_embeddings,
+            "attention_weights": modality_weights,
+        }
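The "attention" pooling strategy above, reduced to a standalone sketch:

    import torch
    import torch.nn.functional as F

    seq = torch.randn(2, 5, 8)                         # (batch, time, hidden)
    scores = torch.randn(2, 5)                         # per-timestep scores
    mask = torch.tensor([[1, 1, 1, 0, 0],
                         [1, 1, 1, 1, 1]]).bool()
    scores = scores.masked_fill(~mask, float("-inf"))  # drop padded steps
    weights = F.softmax(scores, dim=1)
    pooled = (seq * weights.unsqueeze(-1)).sum(dim=1)  # (batch, hidden)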
diff --git a/src/main/python/systemds/scuro/representations/representation.py b/src/main/python/systemds/scuro/representations/representation.py
index 6137baf46dc..144b88f34c0 100644
--- a/src/main/python/systemds/scuro/representations/representation.py
+++ b/src/main/python/systemds/scuro/representations/representation.py
@@ -25,6 +25,7 @@ class Representation:
     def __init__(self, name, parameters):
         self.name = name
         self._parameters = parameters
+        self.self_contained = True

     @property
     def parameters(self):
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index bdfbfb17fc0..f961cb4588a 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -29,13 +29,7 @@
 import torchvision.models as models
 import numpy as np
 from systemds.scuro.modality.type import ModalityType
-
-if torch.backends.mps.is_available():
-    DEVICE = torch.device("mps")
-elif torch.cuda.is_available():
-    DEVICE = torch.device("cuda")
-else:
-    DEVICE = torch.device("cpu")
+from systemds.scuro.utils.static_variables import get_device


 @register_representation(
@@ -72,33 +66,33 @@ def model_name(self, model_name):
         if model_name == "ResNet18":
             self.model = (
                 models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
-                .to(DEVICE)
+                .to(get_device())
                 .to(self.data_type)
             )
         elif model_name == "ResNet34":
             self.model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to(
-                DEVICE
+                get_device()
             )
             self.model = self.model.to(self.data_type)
         elif model_name == "ResNet50":
             self.model = (
                 models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
-                .to(DEVICE)
+                .to(get_device())
                 .to(self.data_type)
             )
         elif model_name == "ResNet101":
             self.model = (
                 models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
-                .to(DEVICE)
+                .to(get_device())
                 .to(self.data_type)
             )
         elif model_name == "ResNet152":
             self.model = (
                 models.resnet152(weights=models.ResNet152_Weights.DEFAULT)
-                .to(DEVICE)
+                .to(get_device())
                 .to(self.data_type)
             )

@@ -129,7 +123,7 @@ def transform(self, modality):
         if next(self.model.parameters()).dtype != self.data_type:
             self.model = self.model.to(self.data_type)

-        dataset = CustomDataset(modality.data, self.data_type, DEVICE)
+        dataset = CustomDataset(modality.data, self.data_type, get_device())

         embeddings = {}

         res5c_output = None
diff --git a/src/main/python/systemds/scuro/representations/spectrogram.py b/src/main/python/systemds/scuro/representations/spectrogram.py
index 8daa9abb015..51b69d7d87c 100644
--- a/src/main/python/systemds/scuro/representations/spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/spectrogram.py
@@ -32,7 +32,7 @@ class Spectrogram(UnimodalRepresentation):
     def __init__(self, hop_length=512, n_fft=2048):
         parameters = {"hop_length": [256, 512, 1024, 2048], "n_fft": [1024, 2048, 4096]}
-        super().__init__("Spectrogram", ModalityType.TIMESERIES, parameters)
+        super().__init__("Spectrogram", ModalityType.TIMESERIES, parameters, False)
         self.hop_length = hop_length
         self.n_fft = n_fft
@@ -48,7 +48,7 @@ def transform(self, modality):
             ).astype(modality.data_type)

             S_dB = librosa.amplitude_to_db(np.abs(spectrogram))
-            result.append(S_dB.T.reshape(-1))
+            result.append(S_dB.T)

         transformed_modality.data = result
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/swin_video_transformer.py b/src/main/python/systemds/scuro/representations/swin_video_transformer.py
index 19b2fd05c4f..c0b7ab38ab0 100644
--- a/src/main/python/systemds/scuro/representations/swin_video_transformer.py
+++ b/src/main/python/systemds/scuro/representations/swin_video_transformer.py
@@ -18,7 +18,7 @@
 # under the License.
 #
 # -------------------------------------------------------------
-# from torchvision.models.video.swin_transformer import swin3d_t
+from torchvision.models.video.swin_transformer import swin3d_t

 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
@@ -31,13 +31,7 @@
 from systemds.scuro.drsearch.operator_registry import register_representation
 from systemds.scuro.utils.torch_dataset import CustomDataset

-
-if torch.backends.mps.is_available():
-    DEVICE = torch.device("mps")
-# elif torch.cuda.is_available():
-#     DEVICE = torch.device("cuda")
-else:
-    DEVICE = torch.device("cpu")
+from systemds.scuro.utils.static_variables import get_device


 # @register_representation([ModalityType.VIDEO])
@@ -55,16 +49,17 @@ def __init__(self, layer_name="avgpool"):
                 "avgpool",
             ],
         }
+        self.data_type = torch.float
         super().__init__("SwinVideoTransformer", ModalityType.TIMESERIES, parameters)
         self.layer_name = layer_name
-        # self.model = swin3d_t(weights=models.video.Swin3D_T_Weights).to(DEVICE)
+        self.model = swin3d_t(weights=models.video.Swin3D_T_Weights.KINETICS400_V1).to(
+            get_device()
+        )
         self.model.eval()
         for param in self.model.parameters():
             param.requires_grad = False

     def transform(self, modality):
-        # model = swin3d_t(weights=models.video.Swin3D_T_Weights)
-
         embeddings = {}
         swin_output = None
@@ -82,11 +77,11 @@ def hook(
             if name == self.layer_name:
                 layer.register_forward_hook(get_features(name))
                 break
-        dataset = CustomDataset(modality.data)
+        dataset = CustomDataset(modality.data, self.data_type, get_device())

-        for instance in dataset:
-            video_id = instance["id"]
-            frames = instance["data"].to(DEVICE)
+        for instance in torch.utils.data.DataLoader(dataset):
+            video_id = instance["id"][0]
+            frames = instance["data"][0]
             embeddings[video_id] = []
             frames = frames.unsqueeze(0).permute(0, 2, 1, 3, 4)
@@ -95,15 +90,18 @@ def transform(self, modality):
             values = swin_output
             pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))

-            embeddings[video_id].extend(torch.flatten(pooled, 1).detach().cpu().numpy())
+            embeddings[video_id].extend(
+                torch.flatten(pooled, 1)
+                .detach()
+                .cpu()
+                .numpy()
+                .astype(modality.data_type)
+            )
             embeddings[video_id] = np.array(embeddings[video_id])

         transformed_modality = TransformedModality(
-            self.output_modality_type,
-            "swinVideoTransformer",
-            modality.modality_id,
-            modality.metadata,
+            modality, self, self.output_modality_type
         )

         transformed_modality.data = list(embeddings.values())
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py b/src/main/python/systemds/scuro/representations/unimodal.py
index 559eec1401c..362888aa278 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/representations/unimodal.py
@@ -24,7 +24,9 @@


 class UnimodalRepresentation(Representation):
-    def __init__(self, name: str, output_modality_type, parameters=None):
+    def __init__(
+        self, name: str, output_modality_type, parameters=None, self_contained=True
+    ):
         """
         Parent class for all unimodal representation types
         :param name: name of the representation
@@ -35,6 +37,7 @@ def __init__(
         self.output_modality_type = output_modality_type
         if parameters is None:
             parameters = {}
+        self.self_contained = self_contained

     @abc.abstractmethod
     def transform(self, data):
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index 88d60ac828b..06e082fb695 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -41,7 +41,7 @@ def get_embedding(sentence, model):

 @register_representation(ModalityType.TEXT)
 class W2V(UnimodalRepresentation):
-    def __init__(self, vector_size=3, min_count=2, window=2, output_file=None):
+    def __init__(self, vector_size=150, min_count=2, window=5, output_file=None):
         parameters = {
             "vector_size": [vector_size],
             "min_count": [min_count],
diff --git a/src/main/python/systemds/scuro/utils/static_variables.py b/src/main/python/systemds/scuro/utils/static_variables.py
index 8237cdf1b3e..807287cd95a 100644
--- a/src/main/python/systemds/scuro/utils/static_variables.py
+++ b/src/main/python/systemds/scuro/utils/static_variables.py
@@ -32,5 +32,6 @@ def get_device():
     return torch.device(
         "cuda:0"
         if torch.cuda.is_available()
-        else "mps" if torch.mps.is_available() else "cpu"
+        # else "mps" if torch.mps.is_available()
+        else "cpu"
     )
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index e57716fa99d..4dcfa5a89ce 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -95,10 +95,16 @@ def create1DModality(

     def create_audio_data(self, num_instances, max_audio_length):
         data = [
-            [random.random() for _ in range(random.randint(1, max_audio_length))]
+            [
+                random.random()
+                for _ in range(
+                    random.randint(int(max_audio_length * 0.9), max_audio_length)
+                )
+            ]
             for _ in range(num_instances)
         ]

+        for i in range(num_instances):
+            data[i] = np.array(data[i]).astype(self.data_type)
+
         metadata = {
             i: ModalityType.AUDIO.create_audio_metadata(16000, np.array(data[i]))
             for i in range(num_instances)
diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py
index 7f2a752722a..a6941fe618c 100644
--- a/src/main/python/tests/scuro/test_operator_registry.py
+++ b/src/main/python/tests/scuro/test_operator_registry.py
@@ -21,7 +21,14 @@

 import unittest

+from systemds.scuro.representations.covarep_audio_features import (
+    ZeroCrossing,
+    Spectral,
+    Pitch,
+    RMSE,
+)
 from systemds.scuro.representations.mfcc import MFCC
+from systemds.scuro.representations.swin_video_transformer import SwinVideoTransformer
 from systemds.scuro.representations.wav2vec import Wav2Vec
 from systemds.scuro.representations.window_aggregation import WindowAggregation
 from systemds.scuro.representations.bow import BoW
@@ -39,19 +46,29 @@
 from systemds.scuro.representations.hadamard import Hadamard
 from systemds.scuro.representations.resnet import ResNet
 from systemds.scuro.representations.sum import Sum
+from systemds.scuro.representations.multimodal_attention_fusion import AttentionFusion


 class TestOperatorRegistry(unittest.TestCase):
     def test_audio_representations_in_registry(self):
         registry = Registry()
-        for representation in [Spectrogram, MelSpectrogram, Wav2Vec, MFCC]:
-            assert representation in registry.get_representations(
-                ModalityType.AUDIO
-            ), f"{representation} not in registry"
+        assert registry.get_representations(ModalityType.AUDIO) == [
+            MelSpectrogram,
+            MFCC,
+            Spectrogram,
+            Wav2Vec,
+            Spectral,
+            ZeroCrossing,
+            RMSE,
+            Pitch,
+        ]

     def test_video_representations_in_registry(self):
         registry = Registry()
-        assert registry.get_representations(ModalityType.VIDEO) == [ResNet]
+        assert registry.get_representations(ModalityType.VIDEO) == [
+            ResNet,
+            # SwinVideoTransformer,
+        ]

     def test_timeseries_representations_in_registry(self):
         registry = Registry()
@@ -70,17 +87,15 @@ def test_context_operator_in_registry(self):

     # def test_fusion_operator_in_registry(self):
     #     registry = Registry()
-    #     for fusion_operator in [
-    #         # RowMax,
-    #         Sum,
+    #     assert registry.get_fusion_operators() == [
     #         Average,
     #         Concatenation,
     #         LSTM,
-    #         Multiplication,
-    #     ]:
-    #         assert (
-    #             fusion_operator in registry.get_fusion_operators()
-    #         ), f"{fusion_operator} not in registry"
+    #         RowMax,
+    #         Hadamard,
+    #         Sum,
+    #         AttentionFusion,
+    #     ]


 if __name__ == "__main__":
diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index a73d7b5fcc1..b5d2b266f6f 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -30,11 +30,14 @@
 from systemds.scuro.drsearch.operator_registry import Registry
 from systemds.scuro.models.model import Model
 from systemds.scuro.drsearch.task import Task
-from systemds.scuro.drsearch.unimodal_optimizer import (
-    UnimodalOptimizer,
-)
+from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer

 from systemds.scuro.representations.spectrogram import Spectrogram
+from systemds.scuro.representations.covarep_audio_features import (
+    ZeroCrossing,
+    Spectral,
+    Pitch,
+)
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.representations.resnet import ResNet
@@ -176,7 +179,7 @@ def optimize_unimodal_representation_for_modality(self, modality):
             "_representations",
             {
                 ModalityType.TEXT: [W2V],
-                ModalityType.AUDIO: [Spectrogram],
+                ModalityType.AUDIO: [Spectrogram, ZeroCrossing, Spectral, Pitch],
                 ModalityType.TIMESERIES: [ResNet],
                 ModalityType.VIDEO: [ResNet],
                 ModalityType.EMBEDDING: [],
diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py
index 2f2e64efd7c..52bca501ace 100644
--- a/src/main/python/tests/scuro/test_unimodal_representations.py
+++ b/src/main/python/tests/scuro/test_unimodal_representations.py
@@ -22,8 +22,18 @@
 import os
 import shutil
 import unittest
+import copy
+import numpy as np

 from systemds.scuro.representations.bow import BoW
+from systemds.scuro.representations.covarep_audio_features import (
+    Spectral,
+    RMSE,
+    Pitch,
+    ZeroCrossing,
+)
+from systemds.scuro.representations.wav2vec import Wav2Vec
+from systemds.scuro.representations.spectrogram import Spectrogram
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.tfidf import TfIdf
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
@@ -31,8 +41,13 @@
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.mfcc import MFCC
 from systemds.scuro.representations.resnet import ResNet
+from systemds.scuro.representations.swin_video_transformer import SwinVideoTransformer
-from tests.scuro.data_generator import setup_data
+from tests.scuro.data_generator import (
+    setup_data,
+    TestDataLoader,
+    ModalityRandomDataGenerator,
+)

 from systemds.scuro.dataloader.audio_loader import AudioLoader
 from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.dataloader.text_loader import TextLoader
@@ -50,52 +65,70 @@ class TestUnimodalRepresentations(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.test_file_path = "unimodal_test_data"
         cls.num_instances = 4
-        cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-
-        cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path)
-        os.makedirs(f"{cls.test_file_path}/embeddings")
-
-    @classmethod
-    def tearDownClass(cls):
-        print("Cleaning up test data")
-        shutil.rmtree(cls.test_file_path)
+        cls.indices = np.array(range(cls.num_instances))

     def test_audio_representations(self):
-        audio_representations = [MFCC()]  # TODO: add FFT, TFN, 1DCNN
-        audio_data_loader = AudioLoader(
-            self.data_generator.get_modality_path(ModalityType.AUDIO),
-            self.data_generator.indices,
+        audio_representations = [
+            MFCC(),
+            MelSpectrogram(),
+            Spectrogram(),
+            Wav2Vec(),
+            Spectral(),
+            ZeroCrossing(),
+            RMSE(),
+            Pitch(),
+        ]  # TODO: add FFT, TFN, 1DCNN
+        audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
+            self.num_instances, 1000
+        )
+
+        audio = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md
+            )
         )
-        audio = UnimodalModality(audio_data_loader)
+
+        audio.extract_raw_data()
+        original_data = copy.deepcopy(audio.data)
         for representation in audio_representations:
             r = audio.apply_representation(representation)
             assert r.data is not None
             assert len(r.data) == self.num_instances
+            for i in range(self.num_instances):
+                assert (audio.data[i] == original_data[i]).all()
+            assert r.data[0].ndim == 2

     def test_video_representations(self):
-        video_representations = [ResNet()]  # Todo: add other video representations
-        video_data_loader = VideoLoader(
-            self.data_generator.get_modality_path(ModalityType.VIDEO),
-            self.data_generator.indices,
+        video_representations = [
+            ResNet(),
+            SwinVideoTransformer(),
+        ]  # Todo: add other video representations
+        video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
+            self.num_instances, 60
+        )
+        video = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md
+            )
         )
-        video = UnimodalModality(video_data_loader)
         for representation in video_representations:
             r = video.apply_representation(representation)
             assert r.data is not None
             assert len(r.data) == self.num_instances
+            assert r.data[0].ndim == 2

     def test_text_representations(self):
         test_representations = [BoW(2, 2), W2V(5, 2, 2), TfIdf(2), Bert()]
-        text_data_loader = TextLoader(
-            self.data_generator.get_modality_path(ModalityType.TEXT),
-            self.data_generator.indices,
+        text_data, text_md = ModalityRandomDataGenerator().create_text_data(
+            self.num_instances
+        )
+        text = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.TEXT, text_data, str, text_md
+            )
         )
-        text = UnimodalModality(text_data_loader)
         for representation in test_representations:
             r = text.apply_representation(representation)
             assert r.data is not None
@@ -103,12 +136,14 @@ def test_text_representations(self):

     def test_chunked_video_representations(self):
         video_representations = [ResNet()]
-        video_data_loader = VideoLoader(
-            self.data_generator.get_modality_path(ModalityType.VIDEO),
-            self.data_generator.indices,
-            chunk_size=2,
+        video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
+            self.num_instances, 60
+        )
+        video = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md
+            )
         )
-        video = UnimodalModality(video_data_loader)
         for representation in video_representations:
             r = video.apply_representation(representation)
             assert r.data is not None