diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
index ae9aed44c0a..b2a5e9df377 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -38,6 +38,9 @@
 from systemds.scuro.representations.lstm import LSTM
 from systemds.scuro.representations.max import RowMax
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
+from systemds.scuro.representations.multimodal_attention_fusion import (
+    AttentionFusion,
+)
 from systemds.scuro.representations.mfcc import MFCC
 from systemds.scuro.representations.hadamard import Hadamard
 from systemds.scuro.representations.optical_flow import OpticalFlow
@@ -73,6 +76,12 @@
 from systemds.scuro.drsearch.unimodal_representation_optimizer import (
     UnimodalRepresentationOptimizer,
 )
+from systemds.scuro.representations.covarep_audio_features import (
+    RMSE,
+    Spectral,
+    ZeroCrossing,
+    Pitch,
+)
 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
@@ -131,4 +140,9 @@
     "UnimodalRepresentationOptimizer",
     "UnimodalOptimizer",
     "MultimodalOptimizer",
+    "ZeroCrossing",
+    "Pitch",
+    "RMSE",
+    "Spectral",
+    "AttentionFusion",
 ]
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
index 1197617673f..d8080c607d0 100644
--- a/src/main/python/systemds/scuro/dataloader/audio_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -19,7 +19,6 @@
 #
 # -------------------------------------------------------------
 from typing import List, Optional, Union
-
 import librosa
 import numpy as np
diff --git a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
index 2da8e7ae195..ac4365ed5c6 100644
--- a/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -98,6 +98,9 @@ def optimize_intramodal_representations(self, task):
             ],
         )

+    # TODO: check if order matters for reused reps - only compute once - check in cache
+    # TODO: parallelize - whenever an item of len 0 comes along give it to a new thread - merge results
+    # TODO: change the algorithm so that one representation is used until there is no more representations to add - saves a lot of memory
     def optimize_intermodal_representations(self, task):
         modality_combos = []
         n = len(self.k_best_cache[task.model.name])
@@ -122,8 +125,7 @@ def generate_extensions(current_combo, remaining_indices):
         reuse_fused_representations = False
         for i, modality_combo in enumerate(modality_combos):
             # clear reuse cache
-            if i % 5 == 0:
-                reuse_cache = self.prune_cache(modality_combos[i:], reuse_cache)
+            reuse_cache = self.prune_cache(modality_combos[i:], reuse_cache)

             if i != 0:
                 reuse_fused_representations = self.is_prefix_match(
diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py b/src/main/python/systemds/scuro/drsearch/operator_registry.py
index 3909b51ff98..699dcad8571 100644
--- a/src/main/python/systemds/scuro/drsearch/operator_registry.py
+++ b/src/main/python/systemds/scuro/drsearch/operator_registry.py
@@ -43,6 +43,12 @@ def __new__(cls):
                 cls._representations[m_type] = []
         return cls._instance

+    def set_fusion_operators(self, fusion_operators):
+        if isinstance(fusion_operators, list):
+            self._fusion_operators = fusion_operators
+        else:
+            self._fusion_operators = [fusion_operators]
+
     def add_representation(
         self, representation: Representation, modality: ModalityType
     ):
@@ -57,6 +63,13 @@ def add_fusion_operator(self, fusion_operator):
     def get_representations(self, modality: ModalityType):
         return self._representations[modality]

+    def get_not_self_contained_representations(self, modality: ModalityType):
+        reps = []
+        for rep in self.get_representations(modality):
+            if not rep().self_contained:
+                reps.append(rep)
+        return reps
+
     def get_context_operators(self):
         # TODO: return modality specific context operations
         return self._context_operators
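A minimal usage sketch for the new registry hook (assuming the audio features added later in this patch, which register with self_contained=False):

    from systemds.scuro.drsearch.operator_registry import Registry
    from systemds.scuro.modality.type import ModalityType

    registry = Registry()
    # RMSE, Pitch, ZeroCrossing and Spectral register as self_contained=False,
    # which marks them as candidates for being combined with one another.
    combinable = registry.get_not_self_contained_representations(ModalityType.AUDIO)
    print([rep().name for rep in combinable])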
diff --git a/src/main/python/systemds/scuro/drsearch/task.py b/src/main/python/systemds/scuro/drsearch/task.py
index 7e05a489e44..d08844c7bb6 100644
--- a/src/main/python/systemds/scuro/drsearch/task.py
+++ b/src/main/python/systemds/scuro/drsearch/task.py
@@ -19,8 +19,10 @@
 #
 # -------------------------------------------------------------
 import time
-from typing import List
+from typing import List, Union

+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.representations.representation import Representation
 from systemds.scuro.models.model import Model
 import numpy as np
 from sklearn.model_selection import KFold
@@ -57,6 +59,8 @@ def __init__(
         self.inference_time = []
         self.training_time = []
         self.expected_dim = 1
+        self.train_scores = []
+        self.val_scores = []

     def get_train_test_split(self, data):
         X_train = [data[i] for i in self.train_indices]
@@ -73,28 +77,69 @@ def run(self, data):
         :param data: The aligned data used in the prediction process
         :return: the validation accuracy
         """
+        self._reset_params()
+        skf = KFold(n_splits=self.kfold, shuffle=True, random_state=11)
+
+        fold = 0
+        X, y, _, _ = self.get_train_test_split(data)
+
+        for train, test in skf.split(X, y):
+            train_X = np.array(X)[train]
+            train_y = np.array(y)[train]
+            test_X = np.array(X)[test]
+            test_y = np.array(y)[test]
+            self._run_fold(train_X, train_y, test_X, test_y)
+            fold += 1
+
+        if self.measure_performance:
+            self.inference_time = np.mean(self.inference_time)
+            self.training_time = np.mean(self.training_time)
+
+        return [np.mean(self.train_scores), np.mean(self.val_scores)]
+
+    def _reset_params(self):
         self.inference_time = []
         self.training_time = []
+        self.train_scores = []
+        self.val_scores = []
+
+    def _run_fold(self, train_X, train_y, test_X, test_y):
+        train_start = time.time()
+        train_score = self.model.fit(train_X, train_y, test_X, test_y)
+        train_end = time.time()
+        self.training_time.append(train_end - train_start)
+        self.train_scores.append(train_score)
+        test_start = time.time()
+        test_score = self.model.test(np.array(test_X), test_y)
+        test_end = time.time()
+        self.inference_time.append(test_end - test_start)
+        self.val_scores.append(test_score)
+
+    def create_representation_and_run(
+        self,
+        representation: Representation,
+        modalities: Union[List[Modality], Modality],
+    ):
+        self._reset_params()
         skf = KFold(n_splits=self.kfold, shuffle=True, random_state=11)
-        train_scores = []
-        test_scores = []
+
         fold = 0
-        X, y, X_test, y_test = self.get_train_test_split(data)
+        X, y, _, _ = self.get_train_test_split(data)

         for train, test in skf.split(X, y):
             train_X = np.array(X)[train]
             train_y = np.array(y)[train]
-            train_start = time.time()
-            train_score = self.model.fit(train_X, train_y, X_test, y_test)
-            train_end = time.time()
-            self.training_time.append(train_end - train_start)
-            train_scores.append(train_score)
-            test_start = time.time()
-            test_score = self.model.test(np.array(X_test), y_test)
-            test_end = time.time()
-            self.inference_time.append(test_end - test_start)
-            test_scores.append(test_score)
+            test_X = np.array(X)[test]
+            test_y = np.array(y)[test]
+
+            if isinstance(modalities, Modality):
+                rep = modalities.apply_representation(representation())
+            else:
+                representation().transform(
+                    train_X, train_y
+                )  # TODO: think about a way how to handle masks
+            self._run_fold(train_X, train_y, test_X, test_y)
             fold += 1

         if self.measure_performance:
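A short sketch of the refactored Task flow (assumes a configured task and pre-aligned data, as elsewhere in this patch): run() resets the per-fold state, delegates each KFold split to _run_fold, and returns fold-averaged scores.

    train_score, val_score = task.run(data)
    # task.training_time and task.inference_time collect per-fold timings and are
    # collapsed to their means when task.measure_performance is set.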
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 030f04aa431..b84d86d94dd 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -20,6 +20,7 @@
 # -------------------------------------------------------------
 import pickle
 import time
+import copy
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from dataclasses import dataclass, field, asdict

@@ -28,11 +29,17 @@
 import numpy as np

 from systemds.scuro.representations.window_aggregation import WindowAggregation
+from systemds.scuro.representations.concatenation import Concatenation
+from systemds.scuro.representations.hadamard import Hadamard
+from systemds.scuro.representations.sum import Sum
 from systemds.scuro.representations.aggregated_representation import (
     AggregatedRepresentation,
 )

-from systemds.scuro import ModalityType, Aggregation
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.representations.aggregate import Aggregation
 from systemds.scuro.drsearch.operator_registry import Registry
 from systemds.scuro.utils.schema_helpers import get_shape
@@ -84,7 +91,6 @@ def optimize_parallel(self, n_workers=None):
     def optimize(self):
         for modality in self.modalities:
             local_result = self._process_modality(modality, False)
-            # self._merge_results(local_result)

     def _process_modality(self, modality, parallel):
         if parallel:
@@ -95,43 +101,59 @@ def _process_modality(self, modality, parallel):
             local_results = self.operator_performance

         context_operators = self.operator_registry.get_context_operators()
-
-        for context_operator in context_operators:
-            context_representation = None
-            if (
-                modality.modality_type != ModalityType.TEXT
-                and modality.modality_type != ModalityType.VIDEO
-            ):
-                con_op = context_operator()
-                context_representation = modality.context(con_op)
-                self._evaluate_local(context_representation, [con_op], local_results)
-
-            modality_specific_operators = self.operator_registry.get_representations(
+        not_self_contained_reps = (
+            self.operator_registry.get_not_self_contained_representations(
                 modality.modality_type
             )
-            for modality_specific_operator in modality_specific_operators:
-                mod_context = None
-                mod_op = modality_specific_operator()
-                if context_representation is not None:
-                    mod_context = context_representation.apply_representation(mod_op)
-                    self._evaluate_local(mod_context, [con_op, mod_op], local_results)
-
-                mod = modality.apply_representation(mod_op)
-                self._evaluate_local(mod, [mod_op], local_results)
-
-                for context_operator_after in context_operators:
-                    con_op_after = context_operator_after()
-                    if mod_context is not None:
-                        mod_context = mod_context.context(con_op_after)
-                        self._evaluate_local(
-                            mod_context, [con_op, mod_op, con_op_after], local_results
-                        )
-
-                    mod = mod.context(con_op_after)
-                    self._evaluate_local(mod, [mod_op, con_op_after], local_results)
+        )
+        modality_specific_operators = self.operator_registry.get_representations(
+            modality.modality_type
+        )
+        for modality_specific_operator in modality_specific_operators:
+            mod_op = modality_specific_operator()
+
+            mod = modality.apply_representation(mod_op)
+            self._evaluate_local(mod, [mod_op], local_results)
+
+            if not mod_op.self_contained:
+                self._combine_non_self_contained_representations(
+                    modality, mod, not_self_contained_reps, local_results
+                )
+
+            for context_operator_after in context_operators:
+                con_op_after = context_operator_after()
+                mod = mod.context(con_op_after)
+                self._evaluate_local(mod, [mod_op, con_op_after], local_results)

         return local_results

+    def _combine_non_self_contained_representations(
+        self,
+        modality: Modality,
+        representation: TransformedModality,
+        other_representations,
+        local_results,
+    ):
+        combined = representation
+        context_operators = self.operator_registry.get_context_operators()
+        used_representations = representation.transformation
+        for other_representation in other_representations:
+            used_representations.append(other_representation())
+            for combination in [Concatenation(), Hadamard(), Sum()]:
+                combined = combined.combine(
+                    modality.apply_representation(other_representation()), combination
+                )
+                self._evaluate_local(
+                    combined, used_representations, local_results, combination
+                )
+
+                for context_op in context_operators:
+                    con_op = context_op()
+                    mod = combined.context(con_op)
+                    c_t = copy.deepcopy(used_representations)
+                    c_t.append(con_op)
+                    self._evaluate_local(mod, c_t, local_results, combination)
+
     def _merge_results(self, local_results):
         """Merge local results into the main results"""
         for modality_id in local_results.results:
@@ -145,7 +167,9 @@ def _merge_results(self, local_results):
                 for key, value in local_results.cache[modality][task_name].items():
                     self.operator_performance.cache[modality][task_name][key] = value

-    def _evaluate_local(self, modality, representations, local_results):
+    def _evaluate_local(
+        self, modality, representations, local_results, combination=None
+    ):
         if self._tasks_require_same_dims:
             if self.expected_dimensions == 1 and get_shape(modality.metadata) > 1:
                 # for aggregation in Aggregation().get_aggregation_functions():
@@ -165,6 +189,7 @@ def _evaluate_local(
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
             else:
                 modality.pad()
@@ -178,6 +203,7 @@ def _evaluate_local(
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
         else:
             for task in self.tasks:
@@ -198,6 +224,7 @@ def _evaluate_local(
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
                 else:
                     # modality.pad()
@@ -210,6 +237,7 @@ def _evaluate_local(
                         modality,
                         task.model.name,
                         end - start,
+                        combination,
                     )
@@ -228,7 +256,9 @@ def __init__(self, modalities, tasks, debug=False):
                 self.cache[modality][task_name] = {}
                 self.results[modality][task_name] = []

-    def add_result(self, scores, representations, modality, task_name, task_time):
+    def add_result(
+        self, scores, representations, modality, task_name, task_time, combination
+    ):
         parameters = []
         representation_names = []
@@ -256,6 +286,7 @@ def add_result(
             val_score=scores[1],
             representation_time=modality.transform_time,
             task_time=task_time,
+            combination=combination.name if combination else "",
         )
         self.results[modality.modality_id][task_name].append(entry)
         self.cache[modality.modality_id][task_name][
@@ -302,3 +333,4 @@ class ResultEntry:
     train_score: float
     representation_time: float
     task_time: float
+    combination: str
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index 94e745b2cc1..f1b00fefcfe 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -168,6 +168,6 @@ def is_aligned(self, other_modality):
                 != list(other_modality.metadata.values())[i]["data_layout"]["shape"]
             ):
                 aligned = False
-                continue
+                break

         return aligned
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index 6523e9502fc..9481937e2ca 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -32,7 +32,9 @@


 class TransformedModality(Modality):

-    def __init__(self, modality, transformation, new_modality_type=None):
+    def __init__(
+        self, modality, transformation, new_modality_type=None, self_contained=True
+    ):
         """
         Parent class of the different Modalities (unimodal & multimodal)
         :param modality_type: Type of the original modality(ies)
@@ -46,8 +48,18 @@ def __init__(
             new_modality_type, modality.modality_id, metadata, modality.data_type
         )
         self.transformation = None
+        self.self_contained = (
+            self_contained and transformation.self_contained
+            if hasattr(transformation, "self_contained")
+            else self_contained
+        )
         self.add_transformation(transformation, modality)

+        if modality.__class__.__name__ == "UnimodalModality":
+            for k, v in self.metadata.items():
+                if "attention_masks" in v:
+                    del self.metadata[k]["attention_masks"]
+
     def add_transformation(self, transformation, modality):
         if (
             transformation.__class__.__bases__[0].__name__ == "Fusion"
@@ -89,14 +101,18 @@ def join(self, right, join_condition):

     def window_aggregation(self, windowSize, aggregation):
         w = WindowAggregation(windowSize, aggregation)
-        transformed_modality = TransformedModality(self, w)
+        transformed_modality = TransformedModality(
+            self, w, self_contained=self.self_contained
+        )
         start = time.time()
         transformed_modality.data = w.execute(self)
         transformed_modality.transform_time = time.time() - start
         return transformed_modality

     def context(self, context_operator):
-        transformed_modality = TransformedModality(self, context_operator)
+        transformed_modality = TransformedModality(
+            self, context_operator, self_contained=self.self_contained
+        )
         start = time.time()
         transformed_modality.data = context_operator.execute(self)
         transformed_modality.transform_time = time.time() - start
@@ -107,6 +123,7 @@ def apply_representation(self, representation):
         new_modality = representation.transform(self)
         new_modality.update_metadata()
         new_modality.transform_time = time.time() - start
+        new_modality.self_contained = representation.self_contained
         return new_modality

     def combine(self, other: Union[Modality, List[Modality]], fusion_method):
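How the self_contained flag is intended to propagate, per the constructors above (hedged sketch; Wav2Vec keeping the default True is an assumption):

    mfcc = audio.apply_representation(MFCC())    # MFCC registers self_contained=False
    windowed = mfcc.window_aggregation(10, agg)  # agg: some Aggregation instance; inherits False
    w2v = audio.apply_representation(Wav2Vec())  # default -> self_contained == True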
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index 94d1fa057d9..dd1674ea85a 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -110,6 +110,9 @@ def apply_representation(self, representation):
             self,
             representation,
         )
+
+        pad_dim_one = False
+
         new_modality.data = []
         start = time.time()

         original_lengths = []
@@ -131,26 +134,39 @@ def apply_representation(self, representation):
             "attention_masks" in entry for entry in new_modality.metadata.values()
         ):
             for d in new_modality.data:
-                original_lengths.append(d.shape[0])
+                if d.shape[0] == 1 and d.ndim == 2:
+                    pad_dim_one = True
+                    original_lengths.append(d.shape[1])
+                else:
+                    original_lengths.append(d.shape[0])
+
+        new_modality.data = self.l2_normalize_features(new_modality.data)

         if len(original_lengths) > 0 and min(original_lengths) < max(original_lengths):
             target_length = max(original_lengths)
             padded_embeddings = []
             for embeddings in new_modality.data:
-                current_length = embeddings.shape[0]
+                current_length = (
+                    embeddings.shape[0] if not pad_dim_one else embeddings.shape[1]
+                )
                 if current_length < target_length:
                     padding_needed = target_length - current_length
-
-                    padded = np.pad(
-                        embeddings,
-                        pad_width=(
-                            (0, padding_needed),
-                            (0, 0),
-                        ),
-                        mode="constant",
-                        constant_values=0,
-                    )
-                    padded_embeddings.append(padded)
+                    if pad_dim_one:
+                        padding = np.zeros((embeddings.shape[0], padding_needed))
+                        padded_embeddings.append(
+                            np.concatenate((embeddings, padding), axis=1)
+                        )
+                    else:
+                        padded = np.pad(
+                            embeddings,
+                            pad_width=(
+                                (0, padding_needed),
+                                (0, 0),
+                            ),
+                            mode="constant",
+                            constant_values=0,
+                        )
+                        padded_embeddings.append(padded)
                 else:
                     padded_embeddings.append(embeddings)
@@ -164,4 +180,22 @@ def apply_representation(self, representation):
             new_modality.data = padded_embeddings
         new_modality.update_metadata()
         new_modality.transform_time = time.time() - start
+        new_modality.self_contained = representation.self_contained
         return new_modality
+
+    def l2_normalize_features(self, feature_list):
+        normalized_features = []
+        for feature in feature_list:
+            original_shape = feature.shape
+            flattened = feature.flatten()
+
+            norm = np.linalg.norm(flattened)
+            if norm > 0:
+                normalized_flat = flattened / norm
+                normalized_feature = normalized_flat.reshape(original_shape)
+            else:
+                normalized_feature = feature
+
+            normalized_features.append(normalized_feature)
+
+        return normalized_features
diff --git a/src/main/python/systemds/scuro/representations/aggregated_representation.py b/src/main/python/systemds/scuro/representations/aggregated_representation.py
index 9412c5be008..9119070a027 100644
--- a/src/main/python/systemds/scuro/representations/aggregated_representation.py
+++ b/src/main/python/systemds/scuro/representations/aggregated_representation.py
@@ -26,8 +26,11 @@ class AggregatedRepresentation(Representation):
     def __init__(self, aggregation):
         super().__init__("AggregatedRepresentation", aggregation.parameters)
         self.aggregation = aggregation
+        self.self_contained = True

     def transform(self, modality):
-        aggregated_modality = TransformedModality(modality, self)
+        aggregated_modality = TransformedModality(
+            modality, self, self_contained=modality.self_contained
+        )
         aggregated_modality.data = self.aggregation.execute(modality)
         return aggregated_modality
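In isolation, l2_normalize_features (added to UnimodalModality above) scales each instance's flattened features to unit L2 norm while preserving shape; all-zero tensors pass through unchanged:

    import numpy as np

    feature = np.array([[3.0, 4.0]])
    normalized = feature / np.linalg.norm(feature.flatten())  # [[0.6, 0.8]]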
diff --git a/src/main/python/systemds/scuro/representations/covarep_audio_features.py b/src/main/python/systemds/scuro/representations/covarep_audio_features.py
new file mode 100644
index 00000000000..3b4398cb11b
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/covarep_audio_features.py
@@ -0,0 +1,156 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import librosa
+import numpy as np
+
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.modality.transformed import TransformedModality
+
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.drsearch.operator_registry import register_representation
+
+
+@register_representation(ModalityType.AUDIO)
+class Spectral(UnimodalRepresentation):
+    def __init__(self, hop_length=512):
+        parameters = {
+            "hop_length": [256, 512, 1024, 2048],
+        }  # TODO
+        super().__init__("Spectral", ModalityType.EMBEDDING, parameters, False)
+        self.hop_length = hop_length
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        result = []
+        for i, y in enumerate(modality.data):
+            sr = list(modality.metadata.values())[i]["frequency"]
+
+            spectral_centroid = librosa.feature.spectral_centroid(
+                y=y, sr=sr, hop_length=self.hop_length
+            )
+            spectral_bandwidth = librosa.feature.spectral_bandwidth(
+                y=y, sr=sr, hop_length=self.hop_length
+            )
+            spectral_rolloff = librosa.feature.spectral_rolloff(
+                y=y, sr=sr, hop_length=self.hop_length
+            )
+            spectral_flatness = librosa.feature.spectral_flatness(
+                y=y, hop_length=self.hop_length
+            )
+
+            features = np.vstack(
+                [
+                    spectral_centroid,
+                    spectral_bandwidth,
+                    spectral_rolloff,
+                    spectral_flatness,
+                ]
+            )
+
+            result.append(features.T)
+
+        transformed_modality.data = result
+
+        return transformed_modality
+
+
+@register_representation(ModalityType.AUDIO)
+class ZeroCrossing(UnimodalRepresentation):
+    def __init__(self, hop_length=512):
+        parameters = {
+            "hop_length": [256, 512, 1024, 2048],
+        }  # TODO
+        super().__init__("ZeroCrossing", ModalityType.EMBEDDING, parameters, False)
+        self.hop_length = hop_length
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        result = []
+        for i, y in enumerate(modality.data):
+            zero_crossing_rate = librosa.feature.zero_crossing_rate(
+                y, hop_length=self.hop_length
+            )
+
+            result.append(zero_crossing_rate)
+
+        transformed_modality.data = result
+
+        return transformed_modality
+
+
+@register_representation(ModalityType.AUDIO)
+class RMSE(UnimodalRepresentation):
+    def __init__(self, frame_length=1024, hop_length=512):
+        parameters = {
+            "frame_length": [1024, 2048, 4096],
+            "hop_length": [256, 512, 1024, 2048],
+        }  # TODO
+        super().__init__("RMSE", ModalityType.EMBEDDING, parameters, False)
+        self.hop_length = hop_length
+        self.frame_length = frame_length
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        result = []
+        for i, y in enumerate(modality.data):
+            rmse = librosa.feature.rms(
+                y=y, frame_length=self.frame_length, hop_length=self.hop_length
+            )
+            result.append(rmse)
+
+        transformed_modality.data = result
+
+        return transformed_modality
+
+
+@register_representation(ModalityType.AUDIO)
+class Pitch(UnimodalRepresentation):
+    def __init__(self, hop_length=512):
+        parameters = {
+            "hop_length": [256, 512, 1024, 2048],
+        }  # TODO
+        super().__init__("Pitch", ModalityType.EMBEDDING, parameters, False)
+        self.hop_length = hop_length
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        result = []
+        for i, y in enumerate(modality.data):
+            sr = list(modality.metadata.values())[i]["frequency"]
+
+            pitches, magnitudes = librosa.piptrack(
+                y=y, sr=sr, hop_length=self.hop_length
+            )
+            pitch = pitches[magnitudes.argmax(axis=0), np.arange(magnitudes.shape[1])]
+
+            result.append(pitch[np.newaxis, :])
+
+        transformed_modality.data = result
+
+        return transformed_modality
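A hedged usage sketch for the new low-level audio features (frame counts assume librosa's default centered framing on 1 s of 16 kHz audio):

    import numpy as np
    import librosa

    y = np.random.randn(16000).astype(np.float32)
    rms = librosa.feature.rms(y=y, frame_length=1024, hop_length=512)  # shape (1, 32)
    zcr = librosa.feature.zero_crossing_rate(y, hop_length=512)        # shape (1, 32)
    # Spectral.transform stacks centroid/bandwidth/rolloff/flatness and transposes,
    # giving (n_frames, 4) per instance; ZeroCrossing and RMSE keep (1, n_frames).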
diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py
index 4b746eee219..61988abba26 100644
--- a/src/main/python/systemds/scuro/representations/fusion.py
+++ b/src/main/python/systemds/scuro/representations/fusion.py
@@ -38,6 +38,8 @@ def __init__(self, name, parameters=None):
         self.associative = False
         self.commutative = False
         self.needs_alignment = False
+        self.needs_training = False
+        self.needs_instance_alignment = False

     def transform(self, modalities: List[Modality]):
         """
@@ -58,8 +60,27 @@ def transform(self, modalities: List[Modality]):
             max_len = self.get_max_embedding_size(mods)
             for modality in mods:
                 modality.pad(max_len=max_len)
+
         return self.execute(mods)

+    def transform_with_training(
+        self, modalities: List[Modality], train_indices, labels
+    ):
+        # if self.needs_instance_alignment:
+        #     max_len = self.get_max_embedding_size(modalities)
+        #     for modality in modalities:
+        #         modality.pad(max_len=max_len)
+
+        self.execute(
+            [np.array(modality.data)[train_indices] for modality in modalities],
+            labels[train_indices],
+        )
+
+    def transform_data(self, modalities: List[Modality], val_indices):
+        return self.apply_representation(
+            [np.array(modality.data)[val_indices] for modality in modalities]
+        )
+
     def execute(self, modalities: List[Modality]):
         raise f"Not implemented for Fusion: {self.name}"
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 8e897542b0c..dca1b0eec85 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -36,7 +36,7 @@ def __init__(self, n_mels=128, hop_length=512, n_fft=2048):
             "hop_length": [256, 512, 1024, 2048],
             "n_fft": [1024, 2048, 4096],
         }
-        super().__init__("MelSpectrogram", ModalityType.TIMESERIES, parameters)
+        super().__init__("MelSpectrogram", ModalityType.TIMESERIES, parameters, False)
         self.n_mels = n_mels
         self.hop_length = hop_length
         self.n_fft = n_fft
@@ -56,9 +56,8 @@ def transform(self, modality):
                 hop_length=self.hop_length,
                 n_fft=self.n_fft,
             ).astype(modality.data_type)
-            S_dB = librosa.power_to_db(S, ref=np.max)

-            result.append(S_dB.T)
+            result.append(S.T)

         transformed_modality.data = result
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py
index 00f735a756e..c942f3076e7 100644
--- a/src/main/python/systemds/scuro/representations/mfcc.py
+++ b/src/main/python/systemds/scuro/representations/mfcc.py
@@ -37,7 +37,7 @@ def __init__(self, n_mfcc=12, dct_type=2, n_mels=128, hop_length=512):
             "hop_length": [256, 512, 1024, 2048],
             "n_mels": [20, 32, 64, 128],
         }  # TODO
-        super().__init__("MFCC", ModalityType.TIMESERIES, parameters)
+        super().__init__("MFCC", ModalityType.TIMESERIES, parameters, False)
         self.n_mfcc = n_mfcc
         self.dct_type = dct_type
         self.n_mels = n_mels
diff --git a/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py b/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py
new file mode 100644
index 00000000000..7928b9988bd
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py
@@ -0,0 +1,365 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import List, Dict, Optional
+import numpy as np
+from systemds.scuro.drsearch.operator_registry import register_fusion_operator
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.representations.fusion import Fusion
+from systemds.scuro.utils.static_variables import get_device
+
+
+@register_fusion_operator()
+class AttentionFusion(Fusion):
+    def __init__(
+        self,
+        hidden_dim=256,
+        num_heads=8,
+        dropout=0.1,
+        fusion_strategy="attention",
+        batch_size=32,
+        num_epochs=50,
+    ):
+        self.encoder = None
+        params = {
+            "hidden_dim": [128, 256, 512],
+            "num_heads": [1, 4, 8],
+            "dropout": [0.1, 0.2, 0.3],
+            "fusion_strategy": ["mean", "max", "attention", "cls"],
+            "batch_size": [32, 64, 128],
+            "num_epochs": [50, 70, 100, 150],
+        }
+        super().__init__("AttentionFusion", params)
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.fusion_strategy = fusion_strategy
+        self.batch_size = batch_size
+        self.needs_training = True
+        self.needs_instance_alignment = True
+        self.num_epochs = num_epochs
+
+    def execute(
+        self,
+        data: List[np.ndarray],
+        labels: np.ndarray,
+    ):
+        input_dimension = {}
+        inputs = {}
+        max_sequence_length = 0
+        masks = {}
+        for i, modality in enumerate(data):
+            modality_name = "modality_" + str(i)
+            shape = modality.shape
+            max_sequence_length = max(max_sequence_length, shape[1])
+            input_dimension[modality_name] = shape[2] if len(shape) > 2 else shape[1]
+            inputs[modality_name] = torch.from_numpy(np.stack(modality)).to(
+                get_device()
+            )
+
+            # attention_masks_list = [
+            #     entry["attention_masks"]
+            #     for entry in modality.metadata.values()
+            #     if "attention_masks" in entry
+            # ]
+            attention_masks_list = None
+            if attention_masks_list:
+                masks[modality_name] = (
+                    torch.tensor(np.array(attention_masks_list)).bool().to(get_device())
+                )
+            else:
+                masks[modality_name] = None
+
+        self.encoder = MultiModalAttentionFusion(
+            input_dimension,
+            self.hidden_dim,
+            self.num_heads,
+            self.dropout,
+            max_sequence_length,
+            self.fusion_strategy,
+        )
+
+        head = FusedClassificationHead(
+            fused_dim=self.hidden_dim, num_classes=len(np.unique(labels))
+        )
+        criterion = nn.CrossEntropyLoss()
+        optimizer = torch.optim.Adam(
+            list(self.encoder.parameters()) + list(head.parameters()), lr=0.001
+        )
+        labels = torch.from_numpy(labels).to(get_device())
+
+        for epoch in range(self.num_epochs):
+            total_loss = 0
+            total_accuracy = 0
+            for batch_idx in range(0, len(labels), self.batch_size):
+                batched_input = {}
+                for modality, modality_data in inputs.items():
+                    batched_input[modality] = modality_data[
+                        batch_idx : batch_idx + self.batch_size
+                    ]
+                loss, predictions = self.train_encoder_step(
+                    head,
+                    batched_input,
+                    labels[batch_idx : batch_idx + self.batch_size],
+                    criterion,
+                    optimizer,
+                )
+                total_loss += loss
+                total_accuracy += predictions
+
+            if epoch % 50 == 0 or epoch == self.num_epochs - 1:
+                print(
+                    f"Epoch {epoch}, Loss: {total_loss:.4f}, accuracy: {total_accuracy/len(labels):.4f}"
+                )
+
+    # Training step (encoder + classification head)
+    def train_encoder_step(self, head, inputs, labels, criterion, optimizer):
+        self.encoder.train()
+        head.train()
+        optimizer.zero_grad()
+        output = self.encoder(inputs)
+        logits = head(output["fused"])
+        loss = criterion(logits, labels)
+        loss.backward()
+        optimizer.step()
+        _, predicted = torch.max(logits.data, 1)
+        return loss.item(), (predicted == labels).sum().item()
+
+    def apply_representation(self, modalities):
+        inputs = {}
+        for i, modality in enumerate(modalities):
+            modality_name = "modality_" + str(i)
+            inputs[modality_name] = torch.from_numpy(np.stack(modality)).to(
+                get_device()
+            )
+        self.encoder.eval()
+        with torch.no_grad():
+            output = self.encoder(inputs)
+        return output["fused"].cpu().numpy()
+
+
+class FusedClassificationHead(nn.Module):
+    """
+    Simple classification head for supervision during training.
+    """
+
+    def __init__(self, fused_dim, num_classes=2):
+        super(FusedClassificationHead, self).__init__()
+        self.head = nn.Sequential(
+            nn.Linear(fused_dim, fused_dim // 2),
+            nn.ReLU(),
+            nn.Linear(fused_dim // 2, num_classes),
+        ).to(get_device())
+
+    def forward(self, fused):
+        return self.head(fused)
+
+
+class MultiModalAttentionFusion(nn.Module):
+    def __init__(
+        self,
+        modality_dims: Dict[str, int],
+        hidden_dim: int,
+        num_heads: int,
+        dropout: float,
+        max_seq_len: int,
+        pooling_strategy: str,
+    ):
+        super().__init__()
+
+        self.modality_dims = modality_dims
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.pooling_strategy = pooling_strategy
+        self.max_seq_len = max_seq_len
+
+        # Project each modality to the same hidden dimension
+        self.modality_projections = nn.ModuleDict(
+            {
+                modality: nn.Linear(dim, hidden_dim).to(get_device())
+                for modality, dim in modality_dims.items()
+            }
+        )
+
+        # Positional encoding for sequence modalities
+        self.positional_encoding = nn.Parameter(
+            torch.randn(max_seq_len, hidden_dim) * 0.1
+        ).to(get_device())
+
+        # Cross-modal attention
+        self.cross_attention = nn.MultiheadAttention(
+            embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout, batch_first=True
+        ).to(get_device())
+
+        # Self-attention within each modality
+        self.self_attention = nn.MultiheadAttention(
+            embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout, batch_first=True
+        ).to(get_device())
+
+        # Attention-based pooling for sequences
+        if pooling_strategy == "attention":
+            self.pooling_attention = nn.Sequential(
+                nn.Linear(hidden_dim, hidden_dim // 2),
+                nn.Tanh(),
+                nn.Linear(hidden_dim // 2, 1),
+            ).to(get_device())
+
+        # Modality-level attention for final fusion
+        self.modality_attention = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim // 2),
+            nn.ReLU(),
+            nn.Linear(hidden_dim // 2, 1),
+        ).to(get_device())
+
+        # Layer normalization
+        self.layer_norm = nn.LayerNorm(hidden_dim).to(get_device())
+        self.dropout = nn.Dropout(dropout).to(get_device())
+
+        # Final projection
+        self.final_projection = nn.Linear(hidden_dim, hidden_dim).to(get_device())
+ """ + + def __init__(self, fused_dim, num_classes=2): + super(FusedClassificationHead, self).__init__() + self.head = nn.Sequential( + nn.Linear(fused_dim, fused_dim // 2), + nn.ReLU(), + nn.Linear(fused_dim // 2, num_classes), + ).to(get_device()) + + def forward(self, fused): + return self.head(fused) + + +class MultiModalAttentionFusion(nn.Module): + def __init__( + self, + modality_dims: Dict[str, int], + hidden_dim: int, + num_heads: int, + dropout: float, + max_seq_len: int, + pooling_strategy: str, + ): + super().__init__() + + self.modality_dims = modality_dims + self.hidden_dim = hidden_dim + self.num_heads = num_heads + self.pooling_strategy = pooling_strategy + self.max_seq_len = max_seq_len + + # Project each modality to the same hidden dimension + self.modality_projections = nn.ModuleDict( + { + modality: nn.Linear(dim, hidden_dim).to(get_device()) + for modality, dim in modality_dims.items() + } + ) + + # Positional encoding for sequence modalities + self.positional_encoding = nn.Parameter( + torch.randn(max_seq_len, hidden_dim) * 0.1 + ).to(get_device()) + + # Cross-modal attention + self.cross_attention = nn.MultiheadAttention( + embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout, batch_first=True + ).to(get_device()) + + # Self-attention within each modality + self.self_attention = nn.MultiheadAttention( + embed_dim=hidden_dim, num_heads=num_heads, dropout=dropout, batch_first=True + ).to(get_device()) + + # Attention-based pooling for sequences + if pooling_strategy == "attention": + self.pooling_attention = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.Tanh(), + nn.Linear(hidden_dim // 2, 1), + ).to(get_device()) + + # Modality-level attention for final fusion + self.modality_attention = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim // 2), + nn.ReLU(), + nn.Linear(hidden_dim // 2, 1), + ).to(get_device()) + + # Layer normalization + self.layer_norm = nn.LayerNorm(hidden_dim).to(get_device()) + self.dropout = nn.Dropout(dropout).to(get_device()) + + # Final projection + self.final_projection = nn.Linear(hidden_dim, hidden_dim).to(get_device()) + + def _handle_input_format(self, modality_tensor: torch.Tensor) -> torch.Tensor: + if len(modality_tensor.shape) == 2: + modality_tensor = modality_tensor.unsqueeze(1) + elif len(modality_tensor.shape) == 3: + pass + else: + raise ValueError( + f"Input tensor must be 2D or 3D, got {len(modality_tensor.shape)}D" + ) + + if modality_tensor.dtype != torch.float: + modality_tensor = modality_tensor.float() + + return modality_tensor + + def _pool_sequence( + self, sequence: torch.Tensor, mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + if self.pooling_strategy == "mean": + if mask is not None: + # Masked mean pooling + masked_seq = sequence * mask.unsqueeze(-1) + return masked_seq.sum(dim=1) / mask.sum(dim=1, keepdim=True).clamp( + min=1 + ) + else: + return sequence.mean(dim=1) + + elif self.pooling_strategy == "max": + if mask is not None: + # Set masked positions to large negative value before max pooling + masked_seq = sequence.masked_fill(~mask.unsqueeze(-1), float("-inf")) + return masked_seq.max(dim=1)[0] + else: + return sequence.max(dim=1)[0] + + elif self.pooling_strategy == "cls": + # Use the first token (assuming it's a CLS token) + return sequence[:, 0, :] + + elif self.pooling_strategy == "attention": + # Attention-based pooling + attention_scores = self.pooling_attention(sequence).squeeze( + -1 + ) # (batch, seq) + + if mask is not None: + attention_scores = 
+
+            if mask is not None:
+                attention_scores = attention_scores.masked_fill(~mask, float("-inf"))
+
+            attention_weights = F.softmax(attention_scores, dim=1)  # (batch, seq)
+            return (sequence * attention_weights.unsqueeze(-1)).sum(
+                dim=1
+            )  # (batch, hidden)
+
+        else:
+            raise ValueError(f"Unknown pooling strategy: {self.pooling_strategy}")
+
+    def forward(
+        self,
+        modality_inputs: Dict[str, torch.Tensor],
+        modality_masks: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> Dict[str, torch.Tensor]:
+        modality_embeddings = {}
+
+        for modality, input_tensor in modality_inputs.items():
+            normalized_input = self._handle_input_format(input_tensor)
+            seq_len = normalized_input.size(1)
+
+            projected = self.modality_projections[modality](normalized_input)
+
+            if seq_len > 1:
+                pos_encoding = self.positional_encoding[:seq_len].unsqueeze(0)
+                projected = projected + pos_encoding
+
+            if seq_len > 1:
+                mask = modality_masks.get(modality) if modality_masks else None
+
+                attended, _ = self.self_attention(
+                    query=projected,
+                    key=projected,
+                    value=projected,
+                    key_padding_mask=~mask if mask is not None else None,
+                )
+
+                projected = self.layer_norm(projected + self.dropout(attended))
+
+                pooled = self._pool_sequence(projected, mask)
+            else:
+                pooled = projected.squeeze(1)
+
+            modality_embeddings[modality] = pooled
+
+        if len(modality_embeddings) > 1:
+            modality_stack = torch.stack(list(modality_embeddings.values()), dim=1)
+
+            cross_attended, cross_attention_weights = self.cross_attention(
+                query=modality_stack, key=modality_stack, value=modality_stack
+            )
+
+            cross_attended = self.layer_norm(
+                modality_stack + self.dropout(cross_attended)
+            )
+
+            updated_embeddings = {
+                modality: cross_attended[:, i, :]
+                for i, modality in enumerate(modality_embeddings.keys())
+            }
+            modality_embeddings = updated_embeddings
+
+        modality_stack = torch.stack(list(modality_embeddings.values()), dim=1)
+        modality_scores = self.modality_attention(modality_stack).squeeze(-1)
+        modality_weights = F.softmax(modality_scores, dim=1)
+
+        fused_representation = (modality_stack * modality_weights.unsqueeze(-1)).sum(
+            dim=1
+        )
+
+        output = self.final_projection(fused_representation)
+
+        return {
+            "fused": output,
+            "modality_embeddings": modality_embeddings,
+            "attention_weights": modality_weights,
+        }
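The "attention" pooling strategy above, reduced to a standalone sketch:

    import torch
    import torch.nn.functional as F

    seq = torch.randn(2, 5, 8)                         # (batch, time, hidden)
    scores = torch.randn(2, 5)                         # per-timestep scores
    mask = torch.tensor([[1, 1, 1, 0, 0],
                         [1, 1, 1, 1, 1]]).bool()
    scores = scores.masked_fill(~mask, float("-inf"))  # drop padded steps
    weights = F.softmax(scores, dim=1)
    pooled = (seq * weights.unsqueeze(-1)).sum(dim=1)  # (batch, hidden)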
diff --git a/src/main/python/systemds/scuro/representations/representation.py b/src/main/python/systemds/scuro/representations/representation.py
index 6137baf46dc..144b88f34c0 100644
--- a/src/main/python/systemds/scuro/representations/representation.py
+++ b/src/main/python/systemds/scuro/representations/representation.py
@@ -25,6 +25,7 @@ class Representation:
     def __init__(self, name, parameters):
         self.name = name
         self._parameters = parameters
+        self.self_contained = True

     @property
     def parameters(self):
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index bdfbfb17fc0..f961cb4588a 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -29,13 +29,7 @@
 import torchvision.models as models
 import numpy as np
 from systemds.scuro.modality.type import ModalityType
-
-if torch.backends.mps.is_available():
-    DEVICE = torch.device("mps")
-elif torch.cuda.is_available():
-    DEVICE = torch.device("cuda")
-else:
-    DEVICE = torch.device("cpu")
+from systemds.scuro.utils.static_variables import get_device


 @register_representation(
@@ -72,33 +66,33 @@ def model_name(self, model_name):
         if model_name == "ResNet18":
             self.model = (
                 models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
-                .to(DEVICE)
+                .to(get_device())
                 .to(self.data_type)
             )
         elif model_name == "ResNet34":
             self.model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to(
-                DEVICE
+                get_device()
             )
             self.model = self.model.to(self.data_type)
         elif model_name == "ResNet50":
             self.model = (
                 models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
-                .to(DEVICE)
+                .to(get_device())
                 .to(self.data_type)
             )
         elif model_name == "ResNet101":
             self.model = (
                 models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
-                .to(DEVICE)
+                .to(get_device())
                 .to(self.data_type)
             )
         elif model_name == "ResNet152":
             self.model = (
                 models.resnet152(weights=models.ResNet152_Weights.DEFAULT)
-                .to(DEVICE)
+                .to(get_device())
                 .to(self.data_type)
             )

@@ -129,7 +123,7 @@ def transform(self, modality):
         if next(self.model.parameters()).dtype != self.data_type:
             self.model = self.model.to(self.data_type)

-        dataset = CustomDataset(modality.data, self.data_type, DEVICE)
+        dataset = CustomDataset(modality.data, self.data_type, get_device())

         embeddings = {}

         res5c_output = None
diff --git a/src/main/python/systemds/scuro/representations/spectrogram.py b/src/main/python/systemds/scuro/representations/spectrogram.py
index 8daa9abb015..51b69d7d87c 100644
--- a/src/main/python/systemds/scuro/representations/spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/spectrogram.py
@@ -32,7 +32,7 @@ class Spectrogram(UnimodalRepresentation):
     def __init__(self, hop_length=512, n_fft=2048):
         parameters = {"hop_length": [256, 512, 1024, 2048], "n_fft": [1024, 2048, 4096]}
-        super().__init__("Spectrogram", ModalityType.TIMESERIES, parameters)
+        super().__init__("Spectrogram", ModalityType.TIMESERIES, parameters, False)
         self.hop_length = hop_length
         self.n_fft = n_fft
@@ -48,7 +48,7 @@ def transform(self, modality):
             ).astype(modality.data_type)

             S_dB = librosa.amplitude_to_db(np.abs(spectrogram))
-            result.append(S_dB.T.reshape(-1))
+            result.append(S_dB.T)

         transformed_modality.data = result
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/swin_video_transformer.py b/src/main/python/systemds/scuro/representations/swin_video_transformer.py
index 19b2fd05c4f..c0b7ab38ab0 100644
--- a/src/main/python/systemds/scuro/representations/swin_video_transformer.py
+++ b/src/main/python/systemds/scuro/representations/swin_video_transformer.py
@@ -18,7 +18,7 @@
 # under the License.
 #
 # -------------------------------------------------------------
-# from torchvision.models.video.swin_transformer import swin3d_t
+from torchvision.models.video.swin_transformer import swin3d_t

 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
@@ -31,13 +31,7 @@
 from systemds.scuro.drsearch.operator_registry import register_representation
 from systemds.scuro.utils.torch_dataset import CustomDataset

-
-if torch.backends.mps.is_available():
-    DEVICE = torch.device("mps")
-# elif torch.cuda.is_available():
-#     DEVICE = torch.device("cuda")
-else:
-    DEVICE = torch.device("cpu")
+from systemds.scuro.utils.static_variables import get_device


 # @register_representation([ModalityType.VIDEO])
@@ -55,16 +49,17 @@ def __init__(self, layer_name="avgpool"):
                 "avgpool",
             ],
         }
+        self.data_type = torch.float
         super().__init__("SwinVideoTransformer", ModalityType.TIMESERIES, parameters)
         self.layer_name = layer_name
-        # self.model = swin3d_t(weights=models.video.Swin3D_T_Weights).to(DEVICE)
+        self.model = swin3d_t(weights=models.video.Swin3D_T_Weights.KINETICS400_V1).to(
+            get_device()
+        )
         self.model.eval()
         for param in self.model.parameters():
             param.requires_grad = False

     def transform(self, modality):
-        # model = swin3d_t(weights=models.video.Swin3D_T_Weights)
-
         embeddings = {}
         swin_output = None
@@ -82,11 +77,11 @@ def hook(
             if name == self.layer_name:
                 layer.register_forward_hook(get_features(name))
                 break
-        dataset = CustomDataset(modality.data)
+        dataset = CustomDataset(modality.data, self.data_type, get_device())

-        for instance in dataset:
-            video_id = instance["id"]
-            frames = instance["data"].to(DEVICE)
+        for instance in torch.utils.data.DataLoader(dataset):
+            video_id = instance["id"][0]
+            frames = instance["data"][0]
             embeddings[video_id] = []
             frames = frames.unsqueeze(0).permute(0, 2, 1, 3, 4)
@@ -95,15 +90,18 @@ def transform(self, modality):
             values = swin_output
             pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))

-            embeddings[video_id].extend(torch.flatten(pooled, 1).detach().cpu().numpy())
+            embeddings[video_id].extend(
+                torch.flatten(pooled, 1)
+                .detach()
+                .cpu()
+                .numpy()
+                .astype(modality.data_type)
+            )
             embeddings[video_id] = np.array(embeddings[video_id])

         transformed_modality = TransformedModality(
-            self.output_modality_type,
-            "swinVideoTransformer",
-            modality.modality_id,
-            modality.metadata,
+            modality, self, self.output_modality_type
         )

         transformed_modality.data = list(embeddings.values())
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py b/src/main/python/systemds/scuro/representations/unimodal.py
index 559eec1401c..362888aa278 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/representations/unimodal.py
@@ -24,7 +24,9 @@


 class UnimodalRepresentation(Representation):
-    def __init__(self, name: str, output_modality_type, parameters=None):
+    def __init__(
+        self, name: str, output_modality_type, parameters=None, self_contained=True
+    ):
         """
         Parent class for all unimodal representation types
         :param name: name of the representation
@@ -35,6 +37,7 @@ def __init__(
         self.output_modality_type = output_modality_type
         if parameters is None:
             parameters = {}
+        self.self_contained = self_contained

     @abc.abstractmethod
     def transform(self, data):
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index 88d60ac828b..06e082fb695 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -41,7 +41,7 @@ def get_embedding(sentence, model):

 @register_representation(ModalityType.TEXT)
 class W2V(UnimodalRepresentation):
-    def __init__(self, vector_size=3, min_count=2, window=2, output_file=None):
+    def __init__(self, vector_size=150, min_count=2, window=5, output_file=None):
         parameters = {
             "vector_size": [vector_size],
             "min_count": [min_count],
diff --git a/src/main/python/systemds/scuro/utils/static_variables.py b/src/main/python/systemds/scuro/utils/static_variables.py
index 8237cdf1b3e..807287cd95a 100644
--- a/src/main/python/systemds/scuro/utils/static_variables.py
+++ b/src/main/python/systemds/scuro/utils/static_variables.py
@@ -32,5 +32,6 @@ def get_device():
     return torch.device(
         "cuda:0"
         if torch.cuda.is_available()
-        else "mps" if torch.mps.is_available() else "cpu"
+        # else "mps" if torch.mps.is_available()
+        else "cpu"
     )
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index e57716fa99d..4dcfa5a89ce 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -95,10 +95,16 @@ def create1DModality(

     def create_audio_data(self, num_instances, max_audio_length):
         data = [
-            [random.random() for _ in range(random.randint(1, max_audio_length))]
+            [
+                random.random()
+                for _ in range(
+                    random.randint(int(max_audio_length * 0.9), max_audio_length)
+                )
+            ]
             for _ in range(num_instances)
         ]

+        for i in range(num_instances):
+            data[i] = np.array(data[i]).astype(self.data_type)
+
         metadata = {
             i: ModalityType.AUDIO.create_audio_metadata(16000, np.array(data[i]))
             for i in range(num_instances)
diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py
index 7f2a752722a..a6941fe618c 100644
--- a/src/main/python/tests/scuro/test_operator_registry.py
+++ b/src/main/python/tests/scuro/test_operator_registry.py
@@ -21,7 +21,14 @@

 import unittest

+from systemds.scuro.representations.covarep_audio_features import (
+    ZeroCrossing,
+    Spectral,
+    Pitch,
+    RMSE,
+)
 from systemds.scuro.representations.mfcc import MFCC
+from systemds.scuro.representations.swin_video_transformer import SwinVideoTransformer
 from systemds.scuro.representations.wav2vec import Wav2Vec
 from systemds.scuro.representations.window_aggregation import WindowAggregation
 from systemds.scuro.representations.bow import BoW
@@ -39,19 +46,29 @@
 from systemds.scuro.representations.hadamard import Hadamard
 from systemds.scuro.representations.resnet import ResNet
 from systemds.scuro.representations.sum import Sum
+from systemds.scuro.representations.multimodal_attention_fusion import AttentionFusion


 class TestOperatorRegistry(unittest.TestCase):
     def test_audio_representations_in_registry(self):
         registry = Registry()
-        for representation in [Spectrogram, MelSpectrogram, Wav2Vec, MFCC]:
-            assert representation in registry.get_representations(
-                ModalityType.AUDIO
-            ), f"{representation} not in registry"
+        assert registry.get_representations(ModalityType.AUDIO) == [
+            MelSpectrogram,
+            MFCC,
+            Spectrogram,
+            Wav2Vec,
+            Spectral,
+            ZeroCrossing,
+            RMSE,
+            Pitch,
+        ]

     def test_video_representations_in_registry(self):
         registry = Registry()
-        assert registry.get_representations(ModalityType.VIDEO) == [ResNet]
+        assert registry.get_representations(ModalityType.VIDEO) == [
+            ResNet,
+            # SwinVideoTransformer,
+        ]

     def test_timeseries_representations_in_registry(self):
         registry = Registry()
@@ -70,17 +87,15 @@ def test_context_operator_in_registry(self):

     # def test_fusion_operator_in_registry(self):
     #     registry = Registry()
-    #     for fusion_operator in [
-    #         # RowMax,
-    #         Sum,
+    #     assert registry.get_fusion_operators() == [
     #         Average,
     #         Concatenation,
     #         LSTM,
-    #         Multiplication,
-    #     ]:
-    #         assert (
-    #             fusion_operator in registry.get_fusion_operators()
-    #         ), f"{fusion_operator} not in registry"
+    #         RowMax,
+    #         Hadamard,
+    #         Sum,
+    #         AttentionFusion,
+    #     ]


 if __name__ == "__main__":
diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index a73d7b5fcc1..b5d2b266f6f 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -30,11 +30,14 @@
 from systemds.scuro.drsearch.operator_registry import Registry
 from systemds.scuro.models.model import Model
 from systemds.scuro.drsearch.task import Task
-from systemds.scuro.drsearch.unimodal_optimizer import (
-    UnimodalOptimizer,
-)
+from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer

 from systemds.scuro.representations.spectrogram import Spectrogram
+from systemds.scuro.representations.covarep_audio_features import (
+    ZeroCrossing,
+    Spectral,
+    Pitch,
+)
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.representations.resnet import ResNet
@@ -176,7 +179,7 @@ def optimize_unimodal_representation_for_modality(self, modality):
             "_representations",
             {
                 ModalityType.TEXT: [W2V],
-                ModalityType.AUDIO: [Spectrogram],
+                ModalityType.AUDIO: [Spectrogram, ZeroCrossing, Spectral, Pitch],
                 ModalityType.TIMESERIES: [ResNet],
                 ModalityType.VIDEO: [ResNet],
                 ModalityType.EMBEDDING: [],
diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py
index 2f2e64efd7c..52bca501ace 100644
--- a/src/main/python/tests/scuro/test_unimodal_representations.py
+++ b/src/main/python/tests/scuro/test_unimodal_representations.py
@@ -22,8 +22,18 @@
 import os
 import shutil
 import unittest
+import copy
+import numpy as np

 from systemds.scuro.representations.bow import BoW
+from systemds.scuro.representations.covarep_audio_features import (
+    Spectral,
+    RMSE,
+    Pitch,
+    ZeroCrossing,
+)
+from systemds.scuro.representations.wav2vec import Wav2Vec
+from systemds.scuro.representations.spectrogram import Spectrogram
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.tfidf import TfIdf
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
@@ -31,8 +41,13 @@
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.mfcc import MFCC
 from systemds.scuro.representations.resnet import ResNet
+from systemds.scuro.representations.swin_video_transformer import SwinVideoTransformer
-from tests.scuro.data_generator import setup_data
+from tests.scuro.data_generator import (
+    setup_data,
+    TestDataLoader,
+    ModalityRandomDataGenerator,
+)

 from systemds.scuro.dataloader.audio_loader import AudioLoader
 from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.dataloader.text_loader import TextLoader
@@ -50,52 +65,70 @@ class TestUnimodalRepresentations(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.test_file_path = "unimodal_test_data"
         cls.num_instances = 4
-        cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-
-        cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path)
-        os.makedirs(f"{cls.test_file_path}/embeddings")
-
-    @classmethod
-    def tearDownClass(cls):
-        print("Cleaning up test data")
-        shutil.rmtree(cls.test_file_path)
+        cls.indices = np.array(range(cls.num_instances))

     def test_audio_representations(self):
-        audio_representations = [MFCC()]  # TODO: add FFT, TFN, 1DCNN
-        audio_data_loader = AudioLoader(
-            self.data_generator.get_modality_path(ModalityType.AUDIO),
-            self.data_generator.indices,
+        audio_representations = [
+            MFCC(),
+            MelSpectrogram(),
+            Spectrogram(),
+            Wav2Vec(),
+            Spectral(),
+            ZeroCrossing(),
+            RMSE(),
+            Pitch(),
+        ]  # TODO: add FFT, TFN, 1DCNN
+        audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
+            self.num_instances, 1000
+        )
+
+        audio = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md
+            )
         )
-        audio = UnimodalModality(audio_data_loader)
+
+        audio.extract_raw_data()
+        original_data = copy.deepcopy(audio.data)
         for representation in audio_representations:
             r = audio.apply_representation(representation)
             assert r.data is not None
             assert len(r.data) == self.num_instances
+            for i in range(self.num_instances):
+                assert (audio.data[i] == original_data[i]).all()
+            assert r.data[0].ndim == 2

     def test_video_representations(self):
-        video_representations = [ResNet()]  # Todo: add other video representations
-        video_data_loader = VideoLoader(
-            self.data_generator.get_modality_path(ModalityType.VIDEO),
-            self.data_generator.indices,
+        video_representations = [
+            ResNet(),
+            SwinVideoTransformer(),
+        ]  # Todo: add other video representations
+        video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
+            self.num_instances, 60
+        )
+        video = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md
+            )
         )
-        video = UnimodalModality(video_data_loader)
         for representation in video_representations:
             r = video.apply_representation(representation)
             assert r.data is not None
             assert len(r.data) == self.num_instances
+            assert r.data[0].ndim == 2

     def test_text_representations(self):
         test_representations = [BoW(2, 2), W2V(5, 2, 2), TfIdf(2), Bert()]
-        text_data_loader = TextLoader(
-            self.data_generator.get_modality_path(ModalityType.TEXT),
-            self.data_generator.indices,
+        text_data, text_md = ModalityRandomDataGenerator().create_text_data(
+            self.num_instances
+        )
+        text = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.TEXT, text_data, str, text_md
+            )
         )
-        text = UnimodalModality(text_data_loader)
         for representation in test_representations:
             r = text.apply_representation(representation)
             assert r.data is not None
@@ -103,12 +136,14 @@ def test_text_representations(self):

     def test_chunked_video_representations(self):
         video_representations = [ResNet()]
-        video_data_loader = VideoLoader(
-            self.data_generator.get_modality_path(ModalityType.VIDEO),
-            self.data_generator.indices,
-            chunk_size=2,
+        video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
+            self.num_instances, 60
+        )
+        video = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md
+            )
         )
-        video = UnimodalModality(video_data_loader)
         for representation in video_representations:
             r = video.apply_representation(representation)
             assert r.data is not None