diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
index e74ae53f364..8b5a8621d1d 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -20,6 +20,7 @@
 # -------------------------------------------------------------
 from systemds.scuro.dataloader.base_loader import BaseLoader
 from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.image_loader import ImageLoader
 from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.dataloader.text_loader import TextLoader
 from systemds.scuro.dataloader.json_loader import JSONLoader
@@ -103,6 +104,7 @@
 
 __all__ = [
     "BaseLoader",
+    "ImageLoader",
     "AudioLoader",
     "VideoLoader",
     "TextLoader",
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
index d8080c607d0..8b224e6f259 100644
--- a/src/main/python/systemds/scuro/dataloader/audio_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -47,7 +47,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         if not self.load_data_from_file:
             import numpy as np
 
-            self.metadata[file] = self.modality_type.create_audio_metadata(
+            self.metadata[file] = self.modality_type.create_metadata(
                 1000, np.array([0])
             )
         else:
@@ -56,6 +56,6 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             if self.normalize:
                 audio = librosa.util.normalize(audio)
 
-            self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio)
+            self.metadata[file] = self.modality_type.create_metadata(sr, audio)
 
             self.data.append(audio)
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
index 33b418efb30..777c72d5d5c 100644
--- a/src/main/python/systemds/scuro/dataloader/base_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -34,6 +34,7 @@ def __init__(
         data_type: Union[np.dtype, str],
         chunk_size: Optional[int] = None,
         modality_type=None,
+        ext=None,
     ):
         """
         Base class to load raw data for a given list of indices and stores them in the data object
@@ -53,6 +54,7 @@ def __init__(
         self._num_chunks = 1
         self._chunk_size = None
         self._data_type = data_type
+        self._ext = ext
 
         if chunk_size:
             self.chunk_size = chunk_size
@@ -136,9 +138,10 @@ def get_file_names(self, indices=None):
         is_dir = True if os.path.isdir(self.source_path) else False
         file_names = []
         if is_dir:
-            _, ext = os.path.splitext(os.listdir(self.source_path)[0])
+            if self._ext is None:
+                _, self._ext = os.path.splitext(os.listdir(self.source_path)[0])
             for index in self.indices if indices is None else indices:
-                file_names.append(self.source_path + index + ext)
+                file_names.append(self.source_path + index + self._ext)
             return file_names
         else:
             return self.source_path
@@ -155,10 +158,10 @@ def file_sanity_check(file):
         try:
             file_size = os.path.getsize(file)
         except:
-            raise (f"Error: File {0} not found!".format(file))
+            raise ValueError(f"Error: File {0} not found!".format(file))
 
         if file_size == 0:
-            raise ("File {0} is empty".format(file))
+            raise ValueError("File {0} is empty".format(file))
 
     @staticmethod
     def resolve_data_type(data_type):
diff --git a/src/main/python/systemds/scuro/dataloader/image_loader.py b/src/main/python/systemds/scuro/dataloader/image_loader.py
new file mode 100644
index 00000000000..0667e703b12
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/image_loader.py
@@ -0,0 +1,63 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional, Union
+
+import numpy as np
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+import cv2
+from systemds.scuro.modality.type import ModalityType
+
+
+class ImageLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        data_type: Union[np.dtype, str] = np.float16,
+        chunk_size: Optional[int] = None,
+        load=True,
+        ext=".jpg",
+    ):
+        super().__init__(
+            source_path, indices, data_type, chunk_size, ModalityType.IMAGE, ext
+        )
+        self.load_data_from_file = load
+
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
+        self.file_sanity_check(file)
+
+        image = cv2.imread(file, cv2.IMREAD_COLOR)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        if image.ndim == 2:
+            height, width = image.shape
+            channels = 1
+        else:
+            height, width, channels = image.shape
+
+        image = image.astype(np.float32) / 255.0
+
+        self.metadata[file] = self.modality_type.create_metadata(
+            width, height, channels
+        )
+
+        self.data.append(image)
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
index a355edded89..89ba6b43d51 100644
--- a/src/main/python/systemds/scuro/dataloader/json_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -32,20 +32,28 @@ def __init__(
         self,
         source_path: str,
         indices: List[str],
-        field: str,
+        field: str,  # TODO: make this a list so it is easier to get multiple fields from a json file. (i.e. Mustard: context + sentence)
         data_type: Union[np.dtype, str] = str,
         chunk_size: Optional[int] = None,
+        ext: str = ".json",
     ):
-        super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT)
+        super().__init__(
+            source_path, indices, data_type, chunk_size, ModalityType.TEXT, ext
+        )
         self.field = field
 
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         with open(file) as f:
             json_file = json.load(f)
+
+            if isinstance(index, str):
+                index = [index]
             for idx in index:
-                sentence = json_file[idx][self.field]
-                self.data.append(sentence)
-                self.metadata[idx] = self.modality_type.create_text_metadata(
-                    len(sentence), sentence
-                )
+                try:
+                    text = json_file[idx][self.field]
+                except:
+                    text = json_file[self.field]
+
+                self.data.append(text)
+                self.metadata[idx] = self.modality_type.create_metadata(len(text), text)
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
index 6689fb6d92b..ff75a4f3180 100644
--- a/src/main/python/systemds/scuro/dataloader/text_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -43,7 +43,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
                 if self.prefix:
                     line = re.sub(self.prefix, "", line)
                 line = line.replace("\n", "")
-                self.metadata[file] = self.modality_type.create_text_metadata(
+                self.metadata[file] = self.modality_type.create_metadata(
                     len(line.split()), line
                 )
                 self.data.append(line)
diff --git a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
index 6887d6974f2..1e6bfa8fc2c 100644
--- a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
@@ -70,12 +70,12 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             data = self._normalize_signals(data)
 
         if file:
-            self.metadata[index] = self.modality_type.create_ts_metadata(
+            self.metadata[index] = self.modality_type.create_metadata(
                 self.signal_names, data, self.sampling_rate
             )
         else:
             for i, index in enumerate(self.indices):
-                self.metadata[str(index)] = self.modality_type.create_ts_metadata(
+                self.metadata[str(index)] = self.modality_type.create_metadata(
                     self.signal_names, data[i], self.sampling_rate
                 )
         self.data.append(data)
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 0e77d5dc57b..2c154ecbafe 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -46,7 +46,7 @@ def __init__(
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         # if not self.load_data_from_file:
-        #     self.metadata[file] = self.modality_type.create_video_metadata(
+        #     self.metadata[file] = self.modality_type.create_metadata(
         #         30, 10, 100, 100, 3
         #     )
         # else:
@@ -67,7 +67,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         num_channels = 3
 
-        self.metadata[file] = self.modality_type.create_video_metadata(
+        self.metadata[file] = self.modality_type.create_metadata(
             self.fps, length, width, height, num_channels
         )
 
diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py
index 078ce86f7af..442d72fe76f 100644
--- a/src/main/python/systemds/scuro/modality/joined_transformed.py
+++ b/src/main/python/systemds/scuro/modality/joined_transformed.py
@@ -36,7 +36,8 @@ def __init__(self, left_modality, right_modality, transformation):
         :param transformation: Representation to be applied on the modality
         """
         super().__init__(
-            reduce(or_, [left_modality.modality_type], right_modality.modality_type),
+            left_modality.modality_type,
+            # reduce(or_, [left_modality.modality_type], right_modality.modality_type),
             data_type=left_modality.data_type,
         )
         self.transformation = transformation
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index 7e8e54eff33..f7739f03df9 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -87,7 +87,8 @@ def join(self, right, join_condition):
                 right.extract_raw_data()
 
         joined_modality = JoinedModality(
-            reduce(or_, [right.modality_type], self.modality_type),
+            self.modality_type,
+            # reduce(or_, [right.modality_type], self.modality_type), #TODO
             self,
             right,
             join_condition,
@@ -136,8 +137,10 @@ def combine(self, other: Union[Modality, List[Modality]], fusion_method):
         fused_modality = TransformedModality(
             self, fusion_method, ModalityType.EMBEDDING
         )
+        start_time = time.time()
         fused_modality.data = fusion_method.transform(self.create_modality_list(other))
-
+        end_time = time.time()
+        fused_modality.transform_time = end_time - start_time
         return fused_modality
 
     def combine_with_training(
@@ -147,7 +150,12 @@ def combine_with_training(
             self, fusion_method, ModalityType.EMBEDDING
         )
         modalities = self.create_modality_list(other)
+        start_time = time.time()
         fused_modality.data = fusion_method.transform_with_training(modalities, task)
+        end_time = time.time()
+        fused_modality.transform_time = (
+            end_time - start_time
+        )  # Note: this incldues the training time
 
         return fused_modality
 
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index 382e2631ad6..c6f713df240 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -195,6 +195,14 @@ class ModalityType(Flag):
     EMBEDDING = auto()
     PHYSIOLOGICAL = auto()
 
+    _metadata_factory_methods = {
+        "TEXT": "create_text_metadata",
+        "AUDIO": "create_audio_metadata",
+        "VIDEO": "create_video_metadata",
+        "IMAGE": "create_image_metadata",
+        "TIMESERIES": "create_ts_metadata",
+    }
+
     def get_schema(self):
         return ModalitySchemas.get(self.name)
 
@@ -215,6 +223,27 @@ def add_field_for_instances(self, md, field, data):
 
         return md
 
+    def create_metadata(self, *args, **kwargs):
+        if self.name is None or "|" in self.name:
+            raise ValueError(
+                f"Composite modality types are not supported for metadata creation: {self}"
+            )
+
+        factory_methods = type(self)._metadata_factory_methods
+        method_name = factory_methods.value.get(self.name)
+        if method_name is None:
+            raise NotImplementedError(
+                f"Metadata creation not implemented for modality type: {self.name}"
+            )
+
+        method = getattr(type(self), method_name, None)
+        if method is None:
+            raise NotImplementedError(
+                f"Metadata creation method '{method_name}' not found for {self.name}"
+            )
+
+        return method(self, *args, **kwargs)
+
     def create_audio_metadata(self, sampling_rate, data, is_single_instance=True):
         md = deepcopy(self.get_schema())
         md = ModalitySchemas.update_base_metadata(md, data, is_single_instance)
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index 4ae1067c629..5898ea98c1f 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -72,7 +72,8 @@ def join(self, other, join_condition):
             self.data_loader.update_chunk_sizes(other.data_loader)
 
         joined_modality = JoinedModality(
-            reduce(or_, [other.modality_type], self.modality_type),
+            self.modality_type,
+            # reduce(or_, [other.modality_type], self.modality_type), # TODO
             self,
             other,
             join_condition,
@@ -162,16 +163,21 @@ def apply_representation(self, representation):
                             np.concatenate((embeddings, padding), axis=1)
                         )
                     else:
-                        padded = np.pad(
-                            embeddings,
-                            pad_width=(
-                                (0, padding_needed)
-                                if len(embeddings.shape) == 1
-                                else ((0, padding_needed), (0, 0))
-                            ),
-                            mode="constant",
-                            constant_values=0,
-                        )
+                        if len(embeddings.shape) == 1:
+                            padded = np.zeros(
+                                embeddings.shape[0] + padding_needed,
+                                dtype=embeddings.dtype,
+                            )
+                            padded[: embeddings.shape[0]] = embeddings
+                        else:
+                            padded = np.zeros(
+                                (
+                                    embeddings.shape[0] + padding_needed,
+                                    embeddings.shape[1],
+                                ),
+                                dtype=embeddings.dtype,
+                            )
+                            padded[: embeddings.shape[0], :] = embeddings
                         padded_embeddings.append(padded)
                 else:
                     padded_embeddings.append(embeddings)
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 76769065054..3c43cabb3ee 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -74,19 +74,19 @@ def create1DModality(
         self.modality_type = modality_type
         for i in range(num_instances):
             if modality_type == ModalityType.AUDIO:
-                self.metadata[i] = modality_type.create_audio_metadata(
+                self.metadata[i] = modality_type.create_metadata(
                     num_features / 10, data[i]
                 )
             elif modality_type == ModalityType.TEXT:
-                self.metadata[i] = modality_type.create_text_metadata(
+                self.metadata[i] = modality_type.create_metadata(
                     num_features / 10, data[i]
                 )
             elif modality_type == ModalityType.VIDEO:
-                self.metadata[i] = modality_type.create_video_metadata(
+                self.metadata[i] = modality_type.create_metadata(
                     num_features / 30, 10, 0, 0, 1
                 )
             elif modality_type == ModalityType.TIMESERIES:
-                self.metadata[i] = modality_type.create_ts_metadata(["test"], data[i])
+                self.metadata[i] = modality_type.create_metadata(["test"], data[i])
             else:
                 raise NotImplementedError
 
@@ -96,6 +96,7 @@ def create1DModality(
         return tf_modality
 
     def create_audio_data(self, num_instances, max_audio_length):
+        modality_type = ModalityType.AUDIO
         data = [
             [
                 random.random()
@@ -108,7 +109,7 @@ def create_audio_data(self, num_instances, max_audio_length):
             data[i] = np.array(data[i]).astype(self.data_type)
 
         metadata = {
-            i: ModalityType.AUDIO.create_audio_metadata(16000, np.array(data[i]))
+            i: modality_type.create_metadata(16000, np.array(data[i]))
             for i in range(num_instances)
         }
 
@@ -122,7 +123,7 @@ def create_timeseries_data(self, num_instances, sequence_length, num_features=1)
         if num_features == 1:
             data = [d.squeeze(-1) for d in data]
         metadata = {
-            i: ModalityType.TIMESERIES.create_ts_metadata(
+            i: ModalityType.TIMESERIES.create_metadata(
                 [f"feature_{j}" for j in range(num_features)], data[i]
             )
             for i in range(num_instances)
@@ -186,7 +187,7 @@ def create_text_data(self, num_instances):
             sentences.append(sentence)
 
         metadata = {
-            i: ModalityType.TEXT.create_text_metadata(len(sentences[i]), sentences[i])
+            i: ModalityType.TEXT.create_metadata(len(sentences[i]), sentences[i])
             for i in range(num_instances)
         }
 
@@ -200,13 +201,13 @@ def create_visual_modality(
                 np.random.randint(
                     0,
                     256,
-                    (np.random.randint(1, max_num_frames + 1), height, width, 3),
+                    (np.random.randint(10, max_num_frames + 1), height, width, 3),
                     dtype=np.uint8,
                 )
                 for _ in range(num_instances)
             ]
             metadata = {
-                i: ModalityType.VIDEO.create_video_metadata(
+                i: ModalityType.VIDEO.create_metadata(
                     30, data[i].shape[0], width, height, 3
                 )
                 for i in range(num_instances)
@@ -222,7 +223,7 @@ def create_visual_modality(
                 for _ in range(num_instances)
             ]
             metadata = {
-                i: ModalityType.IMAGE.create_image_metadata(width, height, 3)
+                i: ModalityType.IMAGE.create_metadata(width, height, 3)
                 for i in range(num_instances)
             }