diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index bf20101e6c9..fcd8bf8c849 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -171,7 +171,8 @@ jobs: h5py \ gensim \ opt-einsum \ - nltk + nltk \ + fvcore kill $KA cd src/main/python - python -m unittest discover -s tests/scuro -p 'test_*.py' -v \ No newline at end of file + python -m unittest discover -s tests/scuro -p 'test_*.py' -v diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py index c1db4c3d49e..e74ae53f364 100644 --- a/src/main/python/systemds/scuro/__init__.py +++ b/src/main/python/systemds/scuro/__init__.py @@ -77,6 +77,7 @@ ) from systemds.scuro.representations.word2vec import W2V from systemds.scuro.representations.x3d import X3D +from systemds.scuro.representations.color_histogram import ColorHistogram from systemds.scuro.models.model import Model from systemds.scuro.models.discrete_model import DiscreteModel from systemds.scuro.modality.joined import JoinedModality @@ -97,7 +98,8 @@ ) from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer - +from systemds.scuro.representations.vgg import VGG19 +from systemds.scuro.representations.clip import CLIPText, CLIPVisual __all__ = [ "BaseLoader", @@ -120,6 +122,7 @@ "MFCC", "Hadamard", "OpticalFlow", + "ColorHistogram", "Representation", "NPY", "JSON", @@ -169,4 +172,7 @@ "Quantile", "BandpowerFFT", "ZeroCrossingRate", + "VGG19", + "CLIPVisual", + "CLIPText", ] diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index ef1e0eeab24..382e2631ad6 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -254,7 +254,16 @@ def create_video_metadata(self, frequency, length, width, height, num_channels): md["data_layout"]["representation"] = DataLayout.NESTED_LEVEL md["data_layout"]["type"] = float md["data_layout"]["shape"] = (width, height, num_channels) + return md + def create_image_metadata(self, width, height, num_channels): + md = deepcopy(self.get_schema()) + md["width"] = width + md["height"] = height + md["num_channels"] = num_channels + md["data_layout"]["representation"] = DataLayout.SINGLE_LEVEL + md["data_layout"]["type"] = float + md["data_layout"]["shape"] = (width, height, num_channels) return md diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 373921e95c2..4ae1067c629 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -165,8 +165,9 @@ def apply_representation(self, representation): padded = np.pad( embeddings, pad_width=( - (0, padding_needed), - (0, 0), + (0, padding_needed) + if len(embeddings.shape) == 1 + else ((0, padding_needed), (0, 0)) ), mode="constant", constant_values=0, diff --git a/src/main/python/systemds/scuro/representations/clip.py b/src/main/python/systemds/scuro/representations/clip.py new file mode 100644 index 00000000000..044d0f795a6 --- /dev/null +++ b/src/main/python/systemds/scuro/representations/clip.py @@ -0,0 +1,133 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +import numpy as np +from torchvision import transforms + +from systemds.scuro.modality.transformed import TransformedModality +from systemds.scuro.representations.unimodal import UnimodalRepresentation +import torch +from systemds.scuro.representations.utils import save_embeddings +from systemds.scuro.modality.type import ModalityType +from systemds.scuro.drsearch.operator_registry import register_representation +from transformers import CLIPProcessor, CLIPModel + +from systemds.scuro.utils.converter import numpy_dtype_to_torch_dtype +from systemds.scuro.utils.static_variables import get_device +from systemds.scuro.utils.torch_dataset import CustomDataset + + +@register_representation(ModalityType.VIDEO) +class CLIPVisual(UnimodalRepresentation): + def __init__(self, output_file=None): + parameters = {} + super().__init__("CLIPVisual", ModalityType.EMBEDDING, parameters) + self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to( + get_device() + ) + self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + self.output_file = output_file + + def transform(self, modality): + transformed_modality = TransformedModality(modality, self) + self.data_type = numpy_dtype_to_torch_dtype(modality.data_type) + if next(self.model.parameters()).dtype != self.data_type: + self.model = self.model.to(self.data_type) + + embeddings = self.create_visual_embeddings(modality) + + if self.output_file is not None: + save_embeddings(embeddings, self.output_file) + + transformed_modality.data = list(embeddings.values()) + return transformed_modality + + def create_visual_embeddings(self, modality): + tf = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()]) + dataset = CustomDataset( + modality.data, + self.data_type, + get_device(), + (modality.metadata[0]["width"], modality.metadata[0]["height"]), + tf=tf, + ) + embeddings = {} + for instance in torch.utils.data.DataLoader(dataset): + id = int(instance["id"][0]) + frames = instance["data"][0] + embeddings[id] = [] + batch_size = 64 + + for start_index in range(0, len(frames), batch_size): + end_index = min(start_index + batch_size, len(frames)) + frame_ids_range = range(start_index, end_index) + frame_batch = frames[frame_ids_range] + + inputs = self.processor(images=frame_batch, return_tensors="pt") + with torch.no_grad(): + output = self.model.get_image_features(**inputs) + + if len(output.shape) > 2: + output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1)) + + embeddings[id].extend( + torch.flatten(output, 1) + .detach() + .cpu() + .float() + .numpy() + .astype(modality.data_type) + ) + + embeddings[id] = np.array(embeddings[id]) + return embeddings + + +@register_representation(ModalityType.TEXT) +class CLIPText(UnimodalRepresentation): + def 
__init__(self, output_file=None): + parameters = {} + super().__init__("CLIPText", ModalityType.EMBEDDING, parameters) + self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to( + get_device() + ) + self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + self.output_file = output_file + + def transform(self, modality): + transformed_modality = TransformedModality(modality, self) + + embeddings = self.create_text_embeddings(modality.data, self.model) + + if self.output_file is not None: + save_embeddings(embeddings, self.output_file) + + transformed_modality.data = embeddings + return transformed_modality + + def create_text_embeddings(self, data, model): + embeddings = [] + for d in data: + inputs = self.processor(text=d, return_tensors="pt", padding=True) + with torch.no_grad(): + text_embedding = model.get_text_features(**inputs) + embeddings.append(text_embedding.squeeze().numpy().reshape(1, -1)) + + return embeddings diff --git a/src/main/python/systemds/scuro/representations/color_histogram.py b/src/main/python/systemds/scuro/representations/color_histogram.py new file mode 100644 index 00000000000..6412b1979df --- /dev/null +++ b/src/main/python/systemds/scuro/representations/color_histogram.py @@ -0,0 +1,111 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- + +import numpy as np +import cv2 + +from systemds.scuro.modality.type import ModalityType +from systemds.scuro.representations.unimodal import UnimodalRepresentation +from systemds.scuro.modality.transformed import TransformedModality + + +class ColorHistogram(UnimodalRepresentation): + def __init__( + self, + color_space="RGB", + bins=32, + normalize=True, + aggregation="mean", + output_file=None, + ): + super().__init__( + "ColorHistogram", ModalityType.EMBEDDING, self._get_parameters() + ) + self.color_space = color_space + self.bins = bins + self.normalize = normalize + self.aggregation = aggregation + self.output_file = output_file + + def _get_parameters(self): + return { + "color_space": ["RGB", "HSV", "GRAY"], + "bins": [8, 16, 32, 64, 128, 256, (8, 8, 8), (16, 16, 16)], + "normalize": [True, False], + "aggregation": ["mean", "max", "concat"], + } + + def compute_histogram(self, image): + if self.color_space == "HSV": + img = cv2.cvtColor(image, cv2.COLOR_RGB2HSV) + channels = [0, 1, 2] + elif self.color_space == "GRAY": + img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) + channels = [0] + else: + img = image + channels = [0, 1, 2] + + hist = self._region_histogram(img, channels) + return hist + + def _region_histogram(self, img, channels): + if isinstance(self.bins, tuple): + bins = self.bins + elif len(channels) > 1: + bins = [self.bins] * len(channels) + else: + bins = [self.bins] + hist = cv2.calcHist([img], channels, None, bins, [0, 256] * len(channels)) + hist = hist.flatten() + if self.normalize: + hist_sum = np.sum(hist) + if hist_sum > 0: + hist /= hist_sum + return hist.astype(np.float32) + + def transform(self, modality): + if modality.modality_type == ModalityType.IMAGE: + images = modality.data + hist_list = [self.compute_histogram(img) for img in images] + transformed_modality = TransformedModality( + modality, self, ModalityType.EMBEDDING + ) + transformed_modality.data = hist_list + return transformed_modality + elif modality.modality_type == ModalityType.VIDEO: + embeddings = [] + for vid in modality.data: + frame_hists = [self.compute_histogram(frame) for frame in vid] + if self.aggregation == "mean": + hist = np.mean(frame_hists, axis=0) + elif self.aggregation == "max": + hist = np.max(frame_hists, axis=0) + elif self.aggregation == "concat": + hist = np.concatenate(frame_hists) + embeddings.append(hist) + transformed_modality = TransformedModality( + modality, self, ModalityType.EMBEDDING + ) + transformed_modality.data = embeddings + return transformed_modality + else: + raise ValueError("Unsupported data format for HistogramRepresentation") diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 7bb94d8bfde..f544e6a46fc 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -144,17 +144,21 @@ def hook( embeddings[video_id] = [] batch_size = 64 + if modality.modality_type == ModalityType.IMAGE: + frames = frames.unsqueeze(0) + for start_index in range(0, len(frames), batch_size): end_index = min(start_index + batch_size, len(frames)) frame_ids_range = range(start_index, end_index) frame_batch = frames[frame_ids_range] _ = self.model(frame_batch) - values = res5c_output - pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) + output = res5c_output + if len(output.shape) > 2: + output = 
torch.nn.functional.adaptive_avg_pool2d(output, (1, 1)) embeddings[video_id].extend( - torch.flatten(pooled, 1) + torch.flatten(output, 1) .detach() .cpu() .float() diff --git a/src/main/python/systemds/scuro/representations/swin_video_transformer.py b/src/main/python/systemds/scuro/representations/swin_video_transformer.py index c0b7ab38ab0..e8511dd0cf7 100644 --- a/src/main/python/systemds/scuro/representations/swin_video_transformer.py +++ b/src/main/python/systemds/scuro/representations/swin_video_transformer.py @@ -34,7 +34,7 @@ from systemds.scuro.utils.static_variables import get_device -# @register_representation([ModalityType.VIDEO]) +@register_representation([ModalityType.VIDEO]) class SwinVideoTransformer(UnimodalRepresentation): def __init__(self, layer_name="avgpool"): parameters = { @@ -50,7 +50,7 @@ def __init__(self, layer_name="avgpool"): ], } self.data_type = torch.float - super().__init__("SwinVideoTransformer", ModalityType.TIMESERIES, parameters) + super().__init__("SwinVideoTransformer", ModalityType.EMBEDDING, parameters) self.layer_name = layer_name self.model = swin3d_t(weights=models.video.Swin3D_T_Weights.KINETICS400_V1).to( get_device() @@ -95,6 +95,7 @@ def hook( .detach() .cpu() .numpy() + .flatten() .astype(modality.data_type) ) diff --git a/src/main/python/systemds/scuro/representations/vgg.py b/src/main/python/systemds/scuro/representations/vgg.py new file mode 100644 index 00000000000..374586f2b9a --- /dev/null +++ b/src/main/python/systemds/scuro/representations/vgg.py @@ -0,0 +1,128 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- +from systemds.scuro.utils.converter import numpy_dtype_to_torch_dtype +from systemds.scuro.utils.torch_dataset import CustomDataset +from systemds.scuro.modality.transformed import TransformedModality +from systemds.scuro.representations.unimodal import UnimodalRepresentation +from typing import Tuple, Any +from systemds.scuro.drsearch.operator_registry import register_representation +import torch.utils.data +import torch +import re +import torchvision.models as models +import numpy as np +from systemds.scuro.modality.type import ModalityType +from systemds.scuro.utils.static_variables import get_device + + +@register_representation([ModalityType.IMAGE, ModalityType.VIDEO]) +class VGG19(UnimodalRepresentation): + def __init__(self, layer="classifier.0", output_file=None): + self.data_type = torch.bfloat16 + self.model = models.vgg19(weights=models.VGG19_Weights.DEFAULT).to(get_device()) + parameters = self._get_parameters() + super().__init__("VGG19", ModalityType.EMBEDDING, parameters) + self.output_file = output_file + self.layer_name = layer + self.model.eval() + for param in self.model.parameters(): + param.requires_grad = False + + class Identity(torch.nn.Module): + def forward(self, input_: torch.Tensor) -> torch.Tensor: + return input_ + + self.model.fc = Identity() + + def _get_parameters(self): + parameters = {"layer_name": []} + + parameters["layer_name"] = [ + "features.35", + "classifier.0", + "classifier.3", + "classifier.6", + ] + + return parameters + + def transform(self, modality): + self.data_type = numpy_dtype_to_torch_dtype(modality.data_type) + if next(self.model.parameters()).dtype != self.data_type: + self.model = self.model.to(self.data_type) + + dataset = CustomDataset(modality.data, self.data_type, get_device()) + embeddings = {} + + activations = {} + + def get_activation(name_): + def hook( + _module: torch.nn.Module, input_: Tuple[torch.Tensor], output: Any + ): + activations[name_] = output + + return hook + + digit = re.findall(r"\d+", self.layer_name)[0] + if "feature" in self.layer_name: + self.model.features[int(digit)].register_forward_hook( + get_activation(self.layer_name) + ) + else: + + self.model.classifier[int(digit)].register_forward_hook( + get_activation(self.layer_name) + ) + + for instance in torch.utils.data.DataLoader(dataset): + video_id = instance["id"][0] + frames = instance["data"][0] + embeddings[video_id] = [] + batch_size = 32 + + for start_index in range(0, len(frames), batch_size): + end_index = min(start_index + batch_size, len(frames)) + frame_ids_range = range(start_index, end_index) + frame_batch = frames[frame_ids_range] + + _ = self.model(frame_batch) + output = activations[self.layer_name] + if len(output.shape) == 4: + output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1)) + embeddings[video_id].extend( + torch.flatten(output, 1) + .detach() + .cpu() + .float() + .numpy() + .astype(modality.data_type) + ) + + embeddings[video_id] = np.array(embeddings[video_id]) + + transformed_modality = TransformedModality( + modality, self, self.output_modality_type + ) + + transformed_modality.data = list(embeddings.values()) + + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/x3d.py b/src/main/python/systemds/scuro/representations/x3d.py index 1629ac6f309..7701865f828 100644 --- a/src/main/python/systemds/scuro/representations/x3d.py +++ b/src/main/python/systemds/scuro/representations/x3d.py @@ -18,34 +18,27 @@ # under 
the License. # # ------------------------------------------------------------- +from systemds.scuro.utils.static_variables import get_device from systemds.scuro.utils.torch_dataset import CustomDataset from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation -from typing import Callable, Dict, Tuple, Any +from typing import Tuple, Any import torch.utils.data import torch from torchvision.models.video import r3d_18, s3d import torchvision.models as models -import torchvision.transforms as transforms import numpy as np from systemds.scuro.modality.type import ModalityType from systemds.scuro.drsearch.operator_registry import register_representation -import math -if torch.backends.mps.is_available(): - DEVICE = torch.device("mps") -elif torch.cuda.is_available(): - DEVICE = torch.device("cuda") -else: - DEVICE = torch.device("cpu") - -# @register_representation([ModalityType.VIDEO]) +@register_representation([ModalityType.VIDEO]) class X3D(UnimodalRepresentation): - def __init__(self, layer="avgpool", model_name="r3d", output_file=None): + def __init__(self, layer="classifier.1", model_name="s3d", output_file=None): + self.data_type = torch.float32 self.model_name = model_name parameters = self._get_parameters() - super().__init__("X3D", ModalityType.TIMESERIES, parameters) + super().__init__("X3D", ModalityType.EMBEDDING, parameters) self.output_file = output_file self.layer_name = layer @@ -67,25 +60,37 @@ def model_name(self): def model_name(self, model_name): self._model_name = model_name if model_name == "r3d": - self.model = r3d_18(pretrained=True).to(DEVICE) + self.model = r3d_18(pretrained=True).to(get_device()) elif model_name == "s3d": - self.model = s3d(weights=models.video.S3D_Weights.DEFAULT).to(DEVICE) + self.model = s3d(weights=models.video.S3D_Weights.DEFAULT).to(get_device()) else: raise NotImplementedError def _get_parameters(self, high_level=True): parameters = {"model_name": [], "layer_name": []} - for m in ["r3d", "s3d"]: + for m in ["c3d", "s3d"]: parameters["model_name"].append(m) if high_level: parameters["layer_name"] = [ - "conv1", - "layer1", - "layer2", - "layer3", - "layer4", + "features.1", + "features.2", + "features.3", + "features.4", + "features.5", + "features.6", + "features.7", + "features.8", + "features.9", + "features.10", + "features.11", + "features.12", + "features.13", + "features.14", + "features.15", "avgpool", + "classifier.0", + "classifier.1", ] else: for name, layer in self.model.named_modules(): @@ -93,17 +98,18 @@ def _get_parameters(self, high_level=True): return parameters def transform(self, modality): - dataset = CustomDataset(modality.data) + dataset = CustomDataset(modality.data, self.data_type, get_device()) + embeddings = {} - res5c_output = None + activation = None def get_features(name_): def hook( _module: torch.nn.Module, input_: Tuple[torch.Tensor], output: Any ): - nonlocal res5c_output - res5c_output = output + nonlocal activation + activation = output return hook @@ -115,15 +121,20 @@ def hook( for instance in dataset: video_id = instance["id"] - frames = instance["data"].to(DEVICE) + frames = instance["data"].to(get_device()) embeddings[video_id] = [] frames = frames.unsqueeze(0).permute(0, 2, 1, 3, 4) + if frames.shape[2] < 14: + pad_width = (0, 0, 0, 0, 0, 14 - frames.shape[2], 0, 0, 0, 0) + frames = torch.nn.functional.pad(frames, pad_width, mode="constant") _ = self.model(frames) - values = res5c_output + values = activation pooled = 
torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) - embeddings[video_id].extend(torch.flatten(pooled, 1).detach().cpu().numpy()) + embeddings[video_id].extend( + torch.flatten(pooled, 1).detach().cpu().numpy().flatten() + ) embeddings[video_id] = np.array(embeddings[video_id]) @@ -137,13 +148,13 @@ def hook( class I3D(UnimodalRepresentation): - def __init__(self, layer="avgpool", model_name="i3d", output_file=None): + def __init__(self, layer="blocks.6", model_name="i3d", output_file=None): self.model_name = model_name - parameters = self._get_parameters() self.model = torch.hub.load( "facebookresearch/pytorchvideo", "i3d_r50", pretrained=True - ).to(DEVICE) - super().__init__("I3D", ModalityType.TIMESERIES, parameters) + ).to(get_device()) + parameters = self._get_parameters() + super().__init__("I3D", ModalityType.EMBEDDING, parameters) self.output_file = output_file self.layer_name = layer @@ -152,18 +163,17 @@ def __init__(self, layer="avgpool", model_name="i3d", output_file=None): param.requires_grad = False def _get_parameters(self, high_level=True): - parameters = {"model_name": [], "layer_name": []} - for m in ["r3d", "s3d"]: - parameters["model_name"].append(m) + parameters = {"layer_name": []} if high_level: parameters["layer_name"] = [ - "conv1", - "layer1", - "layer2", - "layer3", - "layer4", - "avgpool", + "blocks.0", + "blocks.1", + "blocks.2", + "blocks.3", + "blocks.4", + "blocks.5", + "blocks.6", ] else: for name, layer in self.model.named_modules(): @@ -171,28 +181,37 @@ def _get_parameters(self, high_level=True): return parameters def transform(self, modality): - dataset = CustomDataset(modality.data, torch.float32, DEVICE) + dataset = CustomDataset(modality.data, torch.float32, get_device()) embeddings = {} features = None - def hook(module, input, output): - pooled = torch.nn.functional.adaptive_avg_pool3d(output, 1).squeeze() - nonlocal features - features = pooled.detach().cpu().numpy() + def get_features(name_): + def hook( + _module: torch.nn.Module, input_: Tuple[torch.Tensor], output: Any + ): + # pooled = torch.nn.functional.adaptive_avg_pool3d(output, 1).squeeze() + nonlocal features + features = output.detach().cpu().numpy() + + return hook - handle = self.model.blocks[6].dropout.register_forward_hook(hook) + if self.layer_name: + for name, layer in self.model.named_modules(): + if name == self.layer_name: + layer.register_forward_hook(get_features(name)) + break for instance in dataset: video_id = instance["id"] - frames = instance["data"].to(DEVICE) + frames = instance["data"].to(get_device()) embeddings[video_id] = [] batch = torch.transpose(frames, 1, 0) batch = batch.unsqueeze(0) _ = self.model(batch) - embeddings[video_id] = features + embeddings[video_id] = features.flatten() transformed_modality = TransformedModality( modality, self, self.output_modality_type diff --git a/src/main/python/systemds/scuro/utils/torch_dataset.py b/src/main/python/systemds/scuro/utils/torch_dataset.py index c04be0ec7b6..2a7ec1f963b 100644 --- a/src/main/python/systemds/scuro/utils/torch_dataset.py +++ b/src/main/python/systemds/scuro/utils/torch_dataset.py @@ -20,12 +20,13 @@ # ------------------------------------------------------------- from typing import Dict +import numpy as np import torch import torchvision.transforms as transforms class CustomDataset(torch.utils.data.Dataset): - def __init__(self, data, data_type, device, size=None): + def __init__(self, data, data_type, device, size=None, tf=None): self.data = data self.data_type = data_type self.device = 
device @@ -33,7 +34,7 @@ def __init__(self, data, data_type, device, size=None): if size is None: self.size = (256, 224) - self.tf = transforms.Compose( + tf_default = transforms.Compose( [ transforms.ToPILImage(), transforms.Resize(self.size[0]), @@ -46,6 +47,11 @@ def __init__(self, data, data_type, device, size=None): ] ) + if tf is None: + self.tf = tf_default + else: + self.tf = tf + def __getitem__(self, index) -> Dict[str, object]: data = self.data[index] output = torch.empty( @@ -54,12 +60,20 @@ def __getitem__(self, index) -> Dict[str, object]: device=self.device, ) - for i, d in enumerate(data): - if data[0].ndim < 3: - d = torch.tensor(d) - d = d.repeat(3, 1, 1) + if isinstance(data, np.ndarray) and data.ndim == 3: + # image + data = torch.tensor(data).permute(2, 0, 1) + output = self.tf(data).to(self.device) + else: + for i, d in enumerate(data): + if data[0].ndim < 3: + d = torch.tensor(d) + d = d.repeat(3, 1, 1) - output[i] = self.tf(d) + tf = self.tf(d) + if tf.shape[0] != 3: + tf = tf[:3, :, :] + output[i] = tf return {"id": index, "data": output} diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 11f034d9cef..76769065054 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -195,26 +195,38 @@ def create_text_data(self, num_instances): def create_visual_modality( self, num_instances, max_num_frames=1, height=28, width=28 ): - data = [ - np.random.randint( - 0, - 256, - (np.random.randint(5, max_num_frames + 1), height, width, 3), - dtype=np.uint8, - ) - for _ in range(num_instances) - ] - if max_num_frames == 1: - print(f"TODO: create image metadata") - else: + if max_num_frames > 1: + data = [ + np.random.randint( + 0, + 256, + (np.random.randint(1, max_num_frames + 1), height, width, 3), + dtype=np.uint8, + ) + for _ in range(num_instances) + ] metadata = { i: ModalityType.VIDEO.create_video_metadata( 30, data[i].shape[0], width, height, 3 ) for i in range(num_instances) } + else: + data = [ + np.random.randint( + 0, + 256, + (height, width, 3), + dtype=np.uint8, + ) + for _ in range(num_instances) + ] + metadata = { + i: ModalityType.IMAGE.create_image_metadata(width, height, 3) + for i in range(num_instances) + } - return (data, metadata) + return data, metadata def create_balanced_labels(self, num_instances, num_classes=2): if num_instances % num_classes != 0: diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py index b5fa4b01b4d..c33eb5fcc2b 100644 --- a/src/main/python/tests/scuro/test_operator_registry.py +++ b/src/main/python/tests/scuro/test_operator_registry.py @@ -21,6 +21,7 @@ import unittest +from systemds.scuro import FrequencyMagnitude from systemds.scuro.representations.covarep_audio_features import ( ZeroCrossing, Spectral, @@ -29,6 +30,9 @@ ) from systemds.scuro.representations.mfcc import MFCC from systemds.scuro.representations.swin_video_transformer import SwinVideoTransformer +from systemds.scuro.representations.clip import CLIPText, CLIPVisual +from systemds.scuro.representations.vgg import VGG19 +from systemds.scuro.representations.x3d import X3D, I3D from systemds.scuro.representations.wav2vec import Wav2Vec from systemds.scuro.representations.window_aggregation import ( WindowAggregation, @@ -39,6 +43,22 @@ from systemds.scuro.representations.word2vec import W2V from systemds.scuro.representations.tfidf import TfIdf from 
systemds.scuro.drsearch.operator_registry import Registry +from systemds.scuro.representations.timeseries_representations import ( + Max, + Mean, + Min, + RMS, + Sum, + Std, + Skew, + Kurtosis, + SpectralCentroid, + BandpowerFFT, + ACF, + Quantile, + ZeroCrossingRate, + FrequencyMagnitude, +) from systemds.scuro.modality.type import ModalityType from systemds.scuro.representations.average import Average from systemds.scuro.representations.bert import Bert @@ -49,7 +69,6 @@ from systemds.scuro.representations.spectrogram import Spectrogram from systemds.scuro.representations.hadamard import Hadamard from systemds.scuro.representations.resnet import ResNet -from systemds.scuro.representations.sum import Sum from systemds.scuro.representations.multimodal_attention_fusion import AttentionFusion @@ -71,16 +90,34 @@ def test_video_representations_in_registry(self): registry = Registry() assert registry.get_representations(ModalityType.VIDEO) == [ ResNet, - # SwinVideoTransformer, + SwinVideoTransformer, + X3D, + VGG19, + CLIPVisual, ] - # def test_timeseries_representations_in_registry(self): - # registry = Registry() - # assert registry.get_representations(ModalityType.TIMESERIES) == [ResNet] + def test_timeseries_representations_in_registry(self): + registry = Registry() + assert registry.get_representations(ModalityType.TIMESERIES) == [ + Mean, + Min, + Max, + Sum, + Std, + Skew, + Quantile, + Kurtosis, + RMS, + ZeroCrossingRate, + ACF, + FrequencyMagnitude, + SpectralCentroid, + BandpowerFFT, + ] def test_text_representations_in_registry(self): registry = Registry() - for representation in [BoW, TfIdf, W2V, Bert]: + for representation in [CLIPText, BoW, TfIdf, W2V, Bert]: assert representation in registry.get_representations( ModalityType.TEXT ), f"{representation} not in registry" diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py index 6789786cfd0..3bc28ee23c5 100644 --- a/src/main/python/tests/scuro/test_unimodal_representations.py +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -23,6 +23,7 @@ import copy import numpy as np +from systemds.scuro.representations.clip import CLIPVisual, CLIPText from systemds.scuro.representations.bow import BoW from systemds.scuro.representations.covarep_audio_features import ( Spectral, @@ -34,6 +35,9 @@ from systemds.scuro.representations.spectrogram import Spectrogram from systemds.scuro.representations.word2vec import W2V from systemds.scuro.representations.tfidf import TfIdf +from systemds.scuro.representations.x3d import X3D +from systemds.scuro.representations.x3d import I3D +from systemds.scuro.representations.color_histogram import ColorHistogram from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.mfcc import MFCC @@ -59,6 +63,7 @@ ZeroCrossingRate, BandpowerFFT, ) +from systemds.scuro.representations.vgg import VGG19 class TestUnimodalRepresentations(unittest.TestCase): @@ -143,11 +148,34 @@ def test_timeseries_representations(self): for i in range(self.num_instances): assert (ts.data[i] == original_data[i]).all() + def test_image_representations(self): + image_representations = [ColorHistogram(), CLIPVisual(), ResNet()] + + image_data, image_md = ModalityRandomDataGenerator().create_visual_modality( + self.num_instances, 1 + ) + + image = UnimodalModality( + TestDataLoader( + self.indices, None, 
ModalityType.IMAGE, image_data, np.float32, image_md + ) + ) + + for representation in image_representations: + r = image.apply_representation(representation) + assert r.data is not None + assert len(r.data) == self.num_instances + def test_video_representations(self): video_representations = [ + CLIPVisual(), + ColorHistogram(), + I3D(), + X3D(), + VGG19(), ResNet(), SwinVideoTransformer(), - ] # Todo: add other video representations + ] video_data, video_md = ModalityRandomDataGenerator().create_visual_modality( self.num_instances, 60 ) @@ -160,10 +188,9 @@ def test_video_representations(self): r = video.apply_representation(representation) assert r.data is not None assert len(r.data) == self.num_instances - assert r.data[0].ndim == 2 def test_text_representations(self): - test_representations = [BoW(2, 2), TfIdf(), W2V()] + test_representations = [CLIPText(), BoW(2, 2), TfIdf(), W2V()] text_data, text_md = ModalityRandomDataGenerator().create_text_data( self.num_instances )
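
Not part of the patch: a minimal sketch of how the new image/video representations added here (ColorHistogram, CLIPVisual, VGG19) can be exercised outside the unittest harness, mirroring the new cases in test_unimodal_representations.py. It assumes the existing test helpers ModalityRandomDataGenerator and TestDataLoader from src/main/python/tests/scuro/data_generator.py are importable as shown, that indices is simply range(num_instances), and that the torchvision VGG-19 and HuggingFace CLIP weights can be downloaded; treat import paths and variable names as illustrative assumptions, not as the patched API surface.

# Illustrative sketch only -- mirrors test_image_representations in
# test_unimodal_representations.py; ModalityRandomDataGenerator and
# TestDataLoader are the test helpers in tests/scuro/data_generator.py
# (import path assumed), not new code introduced by this patch.
import numpy as np

from systemds.scuro.modality.type import ModalityType
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.representations.color_histogram import ColorHistogram
from systemds.scuro.representations.clip import CLIPVisual
from systemds.scuro.representations.vgg import VGG19
from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader

num_instances = 4                      # assumed small sample size
indices = list(range(num_instances))   # assumed instance ids

# max_num_frames=1 now takes the new single-image path and attaches
# IMAGE metadata via create_image_metadata instead of VIDEO metadata.
image_data, image_md = ModalityRandomDataGenerator().create_visual_modality(
    num_instances, 1
)
image = UnimodalModality(
    TestDataLoader(indices, None, ModalityType.IMAGE, image_data, np.float32, image_md)
)

for representation in [ColorHistogram(), CLIPVisual(), VGG19()]:
    r = image.apply_representation(representation)
    assert r.data is not None and len(r.data) == num_instances
    print(type(representation).__name__, np.asarray(r.data[0]).shape)

On the dropped assertion in test_video_representations: the patch removes the old check that every per-instance embedding is 2-dimensional, because several video representations now emit a flattened per-video vector (X3D and I3D flatten their pooled activations, and SwinVideoTransformer flattens its hook output), so per-instance outputs may legitimately be 1-D.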