From a60de3968e347379e9969eba4c2d2b8e823bd44c Mon Sep 17 00:00:00 2001 From: Christina Dionysio Date: Wed, 3 Sep 2025 09:31:07 +0200 Subject: [PATCH] add additional context operators --- src/main/python/systemds/scuro/__init__.py | 8 +- .../scuro/drsearch/unimodal_optimizer.py | 4 +- .../systemds/scuro/representations/fusion.py | 2 + .../representations/window_aggregation.py | 107 +++++++++++++++--- .../tests/scuro/test_operator_registry.py | 12 +- .../tests/scuro/test_unimodal_optimizer.py | 2 +- .../tests/scuro/test_window_operations.py | 53 ++++++++- 7 files changed, 159 insertions(+), 29 deletions(-) diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py index b2a5e9df377..8e83c865a2a 100644 --- a/src/main/python/systemds/scuro/__init__.py +++ b/src/main/python/systemds/scuro/__init__.py @@ -55,7 +55,11 @@ from systemds.scuro.representations.tfidf import TfIdf from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.wav2vec import Wav2Vec -from systemds.scuro.representations.window_aggregation import WindowAggregation +from systemds.scuro.representations.window_aggregation import ( + WindowAggregation, + DynamicWindow, + StaticWindow, +) from systemds.scuro.representations.word2vec import W2V from systemds.scuro.representations.x3d import X3D from systemds.scuro.models.model import Model @@ -145,4 +149,6 @@ "RMSE", "Spectral", "AttentionFusion", + "DynamicWindow", + "StaticWindow", ] diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py index b84d86d94dd..86c7ce1e63a 100644 --- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py +++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py @@ -122,8 +122,8 @@ def _process_modality(self, modality, parallel): for context_operator_after in context_operators: con_op_after = context_operator_after() - mod = mod.context(con_op_after) - self._evaluate_local(mod, [mod_op, con_op_after], local_results) + mod_con = mod.context(con_op_after) + self._evaluate_local(mod_con, [mod_op, con_op_after], local_results) return local_results diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py index 61988abba26..ea614ac0955 100644 --- a/src/main/python/systemds/scuro/representations/fusion.py +++ b/src/main/python/systemds/scuro/representations/fusion.py @@ -105,6 +105,8 @@ def get_max_embedding_size(self, modalities: List[Modality]): curr_shape = modalities[idx].data[0].shape if len(modalities[idx - 1].data) != len(modalities[idx].data): raise f"Modality sizes don't match!" + elif len(curr_shape) == 1: + continue elif curr_shape[1] > max_size: max_size = curr_shape[1] diff --git a/src/main/python/systemds/scuro/representations/window_aggregation.py b/src/main/python/systemds/scuro/representations/window_aggregation.py index 167f4adafea..b3ad9e1b934 100644 --- a/src/main/python/systemds/scuro/representations/window_aggregation.py +++ b/src/main/python/systemds/scuro/representations/window_aggregation.py @@ -18,6 +18,8 @@ # under the License. # # ------------------------------------------------------------- +import copy + import numpy as np import math @@ -28,17 +30,13 @@ from systemds.scuro.representations.context import Context -@register_context_operator() -class WindowAggregation(Context): - def __init__(self, window_size=10, aggregation_function="mean", pad=True): +class Window(Context): + def __init__(self, name, aggregation_function): parameters = { - "window_size": [window_size], "aggregation_function": list(Aggregation().get_aggregation_functions()), - } # TODO: window_size should be dynamic and adapted to the shape of the data - super().__init__("WindowAggregation", parameters) - self.window_size = window_size + } + super().__init__(name, parameters) self.aggregation_function = aggregation_function - self.pad = pad @property def aggregation_function(self): @@ -48,6 +46,15 @@ def aggregation_function(self): def aggregation_function(self, value): self._aggregation_function = Aggregation(value) + +@register_context_operator() +class WindowAggregation(Window): + def __init__(self, window_size=10, aggregation_function="mean", pad=False): + super().__init__("WindowAggregation", aggregation_function) + self.parameters["window_size"] = [window_size] + self.window_size = window_size + self.pad = pad + def execute(self, modality): windowed_data = [] original_lengths = [] @@ -107,24 +114,90 @@ def execute(self, modality): def window_aggregate_single_level(self, instance, new_length): if isinstance(instance, str): return instance - instance = np.array(instance) - num_cols = instance.shape[1] if instance.ndim > 1 else 1 - result = np.empty((new_length, num_cols)) + instance = np.array(copy.deepcopy(instance)) + + result = [] for i in range(0, new_length): - result[i] = self.aggregation_function.aggregate_instance( - instance[i * self.window_size : i * self.window_size + self.window_size] + result.append( + self.aggregation_function.aggregate_instance( + instance[ + i * self.window_size : i * self.window_size + self.window_size + ] + ) ) - if num_cols == 1: - result = result.reshape(-1) - return result + return np.array(result) def window_aggregate_nested_level(self, instance, new_length): result = [[] for _ in range(0, new_length)] - data = np.stack(instance) + data = np.stack(copy.deepcopy(instance)) for i in range(0, new_length): result[i] = self.aggregation_function.aggregate_instance( data[i * self.window_size : i * self.window_size + self.window_size] ) return np.array(result) + + +@register_context_operator() +class StaticWindow(Window): + def __init__(self, num_windows=100, aggregation_function="mean"): + super().__init__("StaticWindow", aggregation_function) + self.parameters["num_windows"] = [num_windows] + self.num_windows = num_windows + + def execute(self, modality): + windowed_data = [] + + for instance in modality.data: + window_size = len(instance) // self.num_windows + remainder = len(instance) % self.num_windows + output = [] + start = 0 + for i in range(0, self.num_windows): + extra = 1 if i < remainder else 0 + end = start + window_size + extra + window = copy.deepcopy(instance[start:end]) + val = ( + self.aggregation_function.aggregate_instance(window) + if len(window) > 0 + else np.zeros_like(output[i - 1]) + ) + output.append(val) + start = end + + windowed_data.append(output) + return np.array(windowed_data) + + +@register_context_operator() +class DynamicWindow(Window): + def __init__(self, num_windows=100, aggregation_function="mean"): + super().__init__("DynamicWindow", aggregation_function) + self.parameters["num_windows"] = [num_windows] + self.num_windows = num_windows + + def execute(self, modality): + windowed_data = [] + + for instance in modality.data: + N = len(instance) + weights = np.geomspace(4, 256, num=self.num_windows) + weights = weights / np.sum(weights) + window_sizes = (weights * N).astype(int) + window_sizes[-1] += N - np.sum(window_sizes) + indices = np.cumsum(window_sizes) + output = [] + start = 0 + for end in indices: + window = copy.deepcopy(instance[start:end]) + val = ( + self.aggregation_function.aggregate_instance(window) + if len(window) > 0 + else np.zeros_like(instance[0]) + ) + output.append(val) + start = end + windowed_data.append(output) + + return np.array(windowed_data) diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py index a6941fe618c..0d83d83bda8 100644 --- a/src/main/python/tests/scuro/test_operator_registry.py +++ b/src/main/python/tests/scuro/test_operator_registry.py @@ -30,7 +30,11 @@ from systemds.scuro.representations.mfcc import MFCC from systemds.scuro.representations.swin_video_transformer import SwinVideoTransformer from systemds.scuro.representations.wav2vec import Wav2Vec -from systemds.scuro.representations.window_aggregation import WindowAggregation +from systemds.scuro.representations.window_aggregation import ( + WindowAggregation, + StaticWindow, + DynamicWindow, +) from systemds.scuro.representations.bow import BoW from systemds.scuro.representations.word2vec import W2V from systemds.scuro.representations.tfidf import TfIdf @@ -83,7 +87,11 @@ def test_text_representations_in_registry(self): def test_context_operator_in_registry(self): registry = Registry() - assert registry.get_context_operators() == [WindowAggregation] + assert registry.get_context_operators() == [ + WindowAggregation, + StaticWindow, + DynamicWindow, + ] # def test_fusion_operator_in_registry(self): # registry = Registry() diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py index b5d2b266f6f..a4952d29f94 100644 --- a/src/main/python/tests/scuro/test_unimodal_optimizer.py +++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py @@ -141,7 +141,7 @@ def setUpClass(cls): def test_unimodal_optimizer_for_audio_modality(self): audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( - self.num_instances, 100 + self.num_instances, 3000 ) audio = UnimodalModality( TestDataLoader( diff --git a/src/main/python/tests/scuro/test_window_operations.py b/src/main/python/tests/scuro/test_window_operations.py index ea1b0f46f2e..9aab25a8148 100644 --- a/src/main/python/tests/scuro/test_window_operations.py +++ b/src/main/python/tests/scuro/test_window_operations.py @@ -24,8 +24,13 @@ import numpy as np -from tests.scuro.data_generator import ModalityRandomDataGenerator +from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader from systemds.scuro.modality.type import ModalityType +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.representations.window_aggregation import ( + StaticWindow, + DynamicWindow, +) class TestWindowOperations(unittest.TestCase): @@ -35,20 +40,56 @@ def setUpClass(cls): cls.data_generator = ModalityRandomDataGenerator() cls.aggregations = ["mean", "sum", "max", "min"] - def test_window_operations_on_audio_representations(self): + def test_static_window(self): + num_windows = 5 + data, md = self.data_generator.create_visual_modality(self.num_instances, 50) + modality = UnimodalModality( + TestDataLoader( + [i for i in range(0, self.num_instances)], + None, + ModalityType.VIDEO, + data, + np.float32, + md, + ) + ) + aggregated_window = modality.context(StaticWindow(num_windows)) + + for i in range(0, self.num_instances): + assert len(aggregated_window.data[i]) == num_windows + + def test_dynamic_window(self): + num_windows = 5 + data, md = self.data_generator.create_visual_modality(self.num_instances, 50) + modality = UnimodalModality( + TestDataLoader( + [i for i in range(0, self.num_instances)], + None, + ModalityType.VIDEO, + data, + np.float32, + md, + ) + ) + aggregated_window = modality.context(DynamicWindow(num_windows)) + + for i in range(0, self.num_instances): + assert len(aggregated_window.data[i]) == num_windows + + def test_window_aggregation_on_audio_representations(self): window_size = 10 - self.run_window_operations_for_modality(ModalityType.AUDIO, window_size) + self.run_window_aggregation_for_modality(ModalityType.AUDIO, window_size) def test_window_operations_on_video_representations(self): window_size = 10 - self.run_window_operations_for_modality(ModalityType.VIDEO, window_size) + self.run_window_aggregation_for_modality(ModalityType.VIDEO, window_size) def test_window_operations_on_text_representations(self): window_size = 10 - self.run_window_operations_for_modality(ModalityType.TEXT, window_size) + self.run_window_aggregation_for_modality(ModalityType.TEXT, window_size) - def run_window_operations_for_modality(self, modality_type, window_size): + def run_window_aggregation_for_modality(self, modality_type, window_size): r = self.data_generator.create1DModality(40, 100, modality_type) for aggregation in self.aggregations: windowed_modality = r.window_aggregation(window_size, aggregation)