[SYSTEMDS-3936] Refactor metadata generation and add additional dataloader

christinadionysio · web-flow · commit 6658c8fdb441 · 2025-11-27T15:26:25.000+01:00
This patch introduces a new dataloader for the image modality. Additionally, it refines the way how the metadata for each modality is created.
diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
@@ -20,6 +20,7 @@
 # -------------------------------------------------------------
 from systemds.scuro.dataloader.base_loader import BaseLoader
 from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.image_loader import ImageLoader
 from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.dataloader.text_loader import TextLoader
 from systemds.scuro.dataloader.json_loader import JSONLoader
@@ -103,6 +104,7 @@
 
 __all__ = [
     "BaseLoader",
+    "ImageLoader",
     "AudioLoader",
     "VideoLoader",
     "TextLoader",
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -47,7 +47,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         if not self.load_data_from_file:
             import numpy as np
 
-            self.metadata[file] = self.modality_type.create_audio_metadata(
+            self.metadata[file] = self.modality_type.create_metadata(
                 1000, np.array([0])
             )
         else:
@@ -56,6 +56,6 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             if self.normalize:
                 audio = librosa.util.normalize(audio)
 
-            self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio)
+            self.metadata[file] = self.modality_type.create_metadata(sr, audio)
 
             self.data.append(audio)
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -34,6 +34,7 @@ def __init__(
         data_type: Union[np.dtype, str],
         chunk_size: Optional[int] = None,
         modality_type=None,
+        ext=None,
     ):
         """
         Base class to load raw data for a given list of indices and stores them in the data object
@@ -53,6 +54,7 @@ def __init__(
         self._num_chunks = 1
         self._chunk_size = None
         self._data_type = data_type
+        self._ext = ext
 
         if chunk_size:
             self.chunk_size = chunk_size
@@ -136,9 +138,10 @@ def get_file_names(self, indices=None):
         is_dir = True if os.path.isdir(self.source_path) else False
         file_names = []
         if is_dir:
-            _, ext = os.path.splitext(os.listdir(self.source_path)[0])
+            if self._ext is None:
+                _, self._ext = os.path.splitext(os.listdir(self.source_path)[0])
             for index in self.indices if indices is None else indices:
-                file_names.append(self.source_path + index + ext)
+                file_names.append(self.source_path + index + self._ext)
             return file_names
         else:
             return self.source_path
@@ -155,10 +158,10 @@ def file_sanity_check(file):
         try:
             file_size = os.path.getsize(file)
         except:
-            raise (f"Error: File {0} not found!".format(file))
+            raise ValueError(f"Error: File {0} not found!".format(file))
 
         if file_size == 0:
-            raise ("File {0} is empty".format(file))
+            raise ValueError("File {0} is empty".format(file))
 
     @staticmethod
     def resolve_data_type(data_type):
diff --git a/src/main/python/systemds/scuro/dataloader/image_loader.py b/src/main/python/systemds/scuro/dataloader/image_loader.py
@@ -0,0 +1,63 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional, Union
+
+import numpy as np
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+import cv2
+from systemds.scuro.modality.type import ModalityType
+
+
+class ImageLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        data_type: Union[np.dtype, str] = np.float16,
+        chunk_size: Optional[int] = None,
+        load=True,
+        ext=".jpg",
+    ):
+        super().__init__(
+            source_path, indices, data_type, chunk_size, ModalityType.IMAGE, ext
+        )
+        self.load_data_from_file = load
+
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
+        self.file_sanity_check(file)
+
+        image = cv2.imread(file, cv2.IMREAD_COLOR)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        if image.ndim == 2:
+            height, width = image.shape
+            channels = 1
+        else:
+            height, width, channels = image.shape
+
+        image = image.astype(np.float32) / 255.0
+
+        self.metadata[file] = self.modality_type.create_metadata(
+            width, height, channels
+        )
+
+        self.data.append(image)
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -32,20 +32,28 @@ def __init__(
         self,
         source_path: str,
         indices: List[str],
-        field: str,
+        field: str,  # TODO: make this a list so it is easier to get multiple fields from a json file. (i.e. Mustard: context + sentence)
         data_type: Union[np.dtype, str] = str,
         chunk_size: Optional[int] = None,
+        ext: str = ".json",
     ):
-        super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT)
+        super().__init__(
+            source_path, indices, data_type, chunk_size, ModalityType.TEXT, ext
+        )
         self.field = field
 
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         with open(file) as f:
             json_file = json.load(f)
+
+            if isinstance(index, str):
+                index = [index]
             for idx in index:
-                sentence = json_file[idx][self.field]
-                self.data.append(sentence)
-                self.metadata[idx] = self.modality_type.create_text_metadata(
-                    len(sentence), sentence
-                )
+                try:
+                    text = json_file[idx][self.field]
+                except:
+                    text = json_file[self.field]
+
+                self.data.append(text)
+                self.metadata[idx] = self.modality_type.create_metadata(len(text), text)
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -43,7 +43,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
                 if self.prefix:
                     line = re.sub(self.prefix, "", line)
                 line = line.replace("\n", "")
-                self.metadata[file] = self.modality_type.create_text_metadata(
+                self.metadata[file] = self.modality_type.create_metadata(
                     len(line.split()), line
                 )
                 self.data.append(line)
diff --git a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
@@ -70,12 +70,12 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             data = self._normalize_signals(data)
 
         if file:
-            self.metadata[index] = self.modality_type.create_ts_metadata(
+            self.metadata[index] = self.modality_type.create_metadata(
                 self.signal_names, data, self.sampling_rate
             )
         else:
             for i, index in enumerate(self.indices):
-                self.metadata[str(index)] = self.modality_type.create_ts_metadata(
+                self.metadata[str(index)] = self.modality_type.create_metadata(
                     self.signal_names, data[i], self.sampling_rate
                 )
         self.data.append(data)
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -46,7 +46,7 @@ def __init__(
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
         # if not self.load_data_from_file:
-        #     self.metadata[file] = self.modality_type.create_video_metadata(
+        #     self.metadata[file] = self.modality_type.create_metadata(
         #         30, 10, 100, 100, 3
         #     )
         # else:
@@ -67,7 +67,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         num_channels = 3
 
-        self.metadata[file] = self.modality_type.create_video_metadata(
+        self.metadata[file] = self.modality_type.create_metadata(
             self.fps, length, width, height, num_channels
         )
 
diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py
@@ -36,7 +36,8 @@ def __init__(self, left_modality, right_modality, transformation):
         :param transformation: Representation to be applied on the modality
         """
         super().__init__(
-            reduce(or_, [left_modality.modality_type], right_modality.modality_type),
+            left_modality.modality_type,
+            # reduce(or_, [left_modality.modality_type], right_modality.modality_type),
             data_type=left_modality.data_type,
         )
         self.transformation = transformation
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
@@ -87,7 +87,8 @@ def join(self, right, join_condition):
                 right.extract_raw_data()
 
         joined_modality = JoinedModality(
-            reduce(or_, [right.modality_type], self.modality_type),
+            self.modality_type,
+            # reduce(or_, [right.modality_type], self.modality_type), #TODO
             self,
             right,
             join_condition,
@@ -136,8 +137,10 @@ def combine(self, other: Union[Modality, List[Modality]], fusion_method):
         fused_modality = TransformedModality(
             self, fusion_method, ModalityType.EMBEDDING
         )
+        start_time = time.time()
         fused_modality.data = fusion_method.transform(self.create_modality_list(other))
-
+        end_time = time.time()
+        fused_modality.transform_time = end_time - start_time
         return fused_modality
 
     def combine_with_training(
@@ -147,7 +150,12 @@ def combine_with_training(
             self, fusion_method, ModalityType.EMBEDDING
         )
         modalities = self.create_modality_list(other)
+        start_time = time.time()
         fused_modality.data = fusion_method.transform_with_training(modalities, task)
+        end_time = time.time()
+        fused_modality.transform_time = (
+            end_time - start_time
+        )  # Note: this incldues the training time
 
         return fused_modality
 
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
@@ -195,6 +195,14 @@ class ModalityType(Flag):
     EMBEDDING = auto()
     PHYSIOLOGICAL = auto()
 
+    _metadata_factory_methods = {
+        "TEXT": "create_text_metadata",
+        "AUDIO": "create_audio_metadata",
+        "VIDEO": "create_video_metadata",
+        "IMAGE": "create_image_metadata",
+        "TIMESERIES": "create_ts_metadata",
+    }
+
     def get_schema(self):
         return ModalitySchemas.get(self.name)
 
@@ -215,6 +223,27 @@ def add_field_for_instances(self, md, field, data):
 
         return md
 
+    def create_metadata(self, *args, **kwargs):
+        if self.name is None or "|" in self.name:
+            raise ValueError(
+                f"Composite modality types are not supported for metadata creation: {self}"
+            )
+
+        factory_methods = type(self)._metadata_factory_methods
+        method_name = factory_methods.value.get(self.name)
+        if method_name is None:
+            raise NotImplementedError(
+                f"Metadata creation not implemented for modality type: {self.name}"
+            )
+
+        method = getattr(type(self), method_name, None)
+        if method is None:
+            raise NotImplementedError(
+                f"Metadata creation method '{method_name}' not found for {self.name}"
+            )
+
+        return method(self, *args, **kwargs)
+
     def create_audio_metadata(self, sampling_rate, data, is_single_instance=True):
         md = deepcopy(self.get_schema())
         md = ModalitySchemas.update_base_metadata(md, data, is_single_instance)
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -72,7 +72,8 @@ def join(self, other, join_condition):
             self.data_loader.update_chunk_sizes(other.data_loader)
 
         joined_modality = JoinedModality(
-            reduce(or_, [other.modality_type], self.modality_type),
+            self.modality_type,
+            # reduce(or_, [other.modality_type], self.modality_type), # TODO
             self,
             other,
             join_condition,
@@ -162,16 +163,21 @@ def apply_representation(self, representation):
                             np.concatenate((embeddings, padding), axis=1)
                         )
                     else:
-                        padded = np.pad(
-                            embeddings,
-                            pad_width=(
-                                (0, padding_needed)
-                                if len(embeddings.shape) == 1
-                                else ((0, padding_needed), (0, 0))
-                            ),
-                            mode="constant",
-                            constant_values=0,
-                        )
+                        if len(embeddings.shape) == 1:
+                            padded = np.zeros(
+                                embeddings.shape[0] + padding_needed,
+                                dtype=embeddings.dtype,
+                            )
+                            padded[: embeddings.shape[0]] = embeddings
+                        else:
+                            padded = np.zeros(
+                                (
+                                    embeddings.shape[0] + padding_needed,
+                                    embeddings.shape[1],
+                                ),
+                                dtype=embeddings.dtype,
+                            )
+                            padded[: embeddings.shape[0], :] = embeddings
                         padded_embeddings.append(padded)
                 else:
                     padded_embeddings.append(embeddings)
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py

Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):`
`43`	`43`	`if self.prefix:`
`44`	`44`	`line = re.sub(self.prefix, "", line)`
`45`	`45`	`line = line.replace("\n", "")`
`46`		`- self.metadata[file] = self.modality_type.create_text_metadata(`
	`46`	`+ self.metadata[file] = self.modality_type.create_metadata(`
`47`	`47`	`len(line.split()), line`
`48`	`48`	`)`
`49`	`49`	`self.data.append(line)`
Original file line number	Diff line number	Diff line change
`@@ -70,12 +70,12 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):`
`70`	`70`	`data = self._normalize_signals(data)`
`71`	`71`
`72`	`72`	`if file:`
`73`		`- self.metadata[index] = self.modality_type.create_ts_metadata(`
	`73`	`+ self.metadata[index] = self.modality_type.create_metadata(`
`74`	`74`	`self.signal_names, data, self.sampling_rate`
`75`	`75`	`)`
`76`	`76`	`else:`
`77`	`77`	`for i, index in enumerate(self.indices):`
`78`		`- self.metadata[str(index)] = self.modality_type.create_ts_metadata(`
	`78`	`+ self.metadata[str(index)] = self.modality_type.create_metadata(`
`79`	`79`	`self.signal_names, data[i], self.sampling_rate`
`80`	`80`	`)`
`81`	`81`	`self.data.append(data)`