Skip to content

Commit 23b3d08

Browse files
[SYSTEMDS-3887] Create representation optimizer
This patch adds an initial version of the representation optimizer for the Scuro library. It is a two-stage optimization: in the first step, the best unimodal representation for the given raw modalities is found, and in the next step, the k-best unimodal representations are combined into multimodal representations and evaluated against the target downstream task. Additionally, this patch adds tests for each stage of the optimizer. Closes #2267
1 parent 38b73ae commit 23b3d08

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+2361
-278
lines changed

src/main/python/systemds/scuro/__init__.py

Lines changed: 68 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,27 +24,55 @@
2424
from systemds.scuro.dataloader.text_loader import TextLoader
2525
from systemds.scuro.dataloader.json_loader import JSONLoader
2626
from systemds.scuro.representations.representation import Representation
27+
from systemds.scuro.representations.aggregate import Aggregation
28+
from systemds.scuro.representations.aggregated_representation import (
29+
AggregatedRepresentation,
30+
)
2731
from systemds.scuro.representations.average import Average
32+
from systemds.scuro.representations.bert import Bert
33+
from systemds.scuro.representations.bow import BoW
2834
from systemds.scuro.representations.concatenation import Concatenation
29-
from systemds.scuro.representations.sum import Sum
35+
from systemds.scuro.representations.context import Context
36+
from systemds.scuro.representations.fusion import Fusion
37+
from systemds.scuro.representations.glove import GloVe
38+
from systemds.scuro.representations.lstm import LSTM
3039
from systemds.scuro.representations.max import RowMax
31-
from systemds.scuro.representations.multiplication import Multiplication
3240
from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
41+
from systemds.scuro.representations.mfcc import MFCC
42+
from systemds.scuro.representations.multiplication import Multiplication
43+
from systemds.scuro.representations.optical_flow import OpticalFlow
44+
from systemds.scuro.representations.representation import Representation
45+
from systemds.scuro.representations.representation_dataloader import NPY
46+
from systemds.scuro.representations.representation_dataloader import JSON
47+
from systemds.scuro.representations.representation_dataloader import Pickle
3348
from systemds.scuro.representations.resnet import ResNet
34-
from systemds.scuro.representations.bert import Bert
35-
from systemds.scuro.representations.lstm import LSTM
36-
from systemds.scuro.representations.bow import BoW
37-
from systemds.scuro.representations.glove import GloVe
49+
from systemds.scuro.representations.spectrogram import Spectrogram
50+
from systemds.scuro.representations.sum import Sum
51+
from systemds.scuro.representations.swin_video_transformer import SwinVideoTransformer
3852
from systemds.scuro.representations.tfidf import TfIdf
53+
from systemds.scuro.representations.unimodal import UnimodalRepresentation
54+
from systemds.scuro.representations.wav2vec import Wav2Vec
55+
from systemds.scuro.representations.window import WindowAggregation
3956
from systemds.scuro.representations.word2vec import W2V
57+
from systemds.scuro.representations.x3d import X3D
4058
from systemds.scuro.models.model import Model
4159
from systemds.scuro.models.discrete_model import DiscreteModel
60+
from systemds.scuro.modality.joined import JoinedModality
61+
from systemds.scuro.modality.joined_transformed import JoinedTransformedModality
4262
from systemds.scuro.modality.modality import Modality
43-
from systemds.scuro.modality.unimodal_modality import UnimodalModality
63+
from systemds.scuro.modality.modality_identifier import ModalityIdentifier
4464
from systemds.scuro.modality.transformed import TransformedModality
4565
from systemds.scuro.modality.type import ModalityType
46-
from systemds.scuro.aligner.dr_search import DRSearch
47-
from systemds.scuro.aligner.task import Task
66+
from systemds.scuro.modality.unimodal_modality import UnimodalModality
67+
from systemds.scuro.drsearch.dr_search import DRSearch
68+
from systemds.scuro.drsearch.task import Task
69+
from systemds.scuro.drsearch.fusion_optimizer import FusionOptimizer
70+
from systemds.scuro.drsearch.operator_registry import Registry
71+
from systemds.scuro.drsearch.optimization_data import OptimizationData
72+
from systemds.scuro.drsearch.representation_cache import RepresentationCache
73+
from systemds.scuro.drsearch.unimodal_representation_optimizer import (
74+
UnimodalRepresentationOptimizer,
75+
)
4876

4977

5078
__all__ = [
@@ -53,25 +81,50 @@
5381
"VideoLoader",
5482
"TextLoader",
5583
"Representation",
84+
"Aggregation",
85+
"AggregatedRepresentation",
5686
"Average",
87+
"Bert",
88+
"BoW",
5789
"Concatenation",
58-
"Sum",
90+
"Context",
91+
"Fusion",
92+
"GloVe",
93+
"LSTM",
5994
"RowMax",
60-
"Multiplication",
6195
"MelSpectrogram",
96+
"MFCC",
97+
"Multiplication",
98+
"OpticalFlow",
99+
"Representation",
100+
"NPY",
101+
"JSON",
102+
"Pickle",
62103
"ResNet",
63-
"Bert",
64-
"LSTM",
104+
"Spectrogram",
105+
"Sum",
65106
"BoW",
66-
"GloVe",
107+
"SwinVideoTransformer",
67108
"TfIdf",
109+
"UnimodalRepresentation",
110+
"Wav2Vec",
111+
"WindowAggregation",
68112
"W2V",
113+
"X3D",
69114
"Model",
70115
"DiscreteModel",
116+
"JoinedModality",
117+
"JoinedTransformedModality",
71118
"Modality",
72-
"UnimodalModality",
119+
"ModalityIdentifier",
73120
"TransformedModality",
74121
"ModalityType",
122+
"UnimodalModality",
75123
"DRSearch",
76124
"Task",
125+
"FusionOptimizer",
126+
"Registry",
127+
"OptimizationData",
128+
"RepresentationCache",
129+
"UnimodalRepresentationOptimizer",
77130
]

src/main/python/systemds/scuro/aligner/alignment.py

Lines changed: 0 additions & 48 deletions
This file was deleted.

src/main/python/systemds/scuro/dataloader/audio_loader.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,22 @@
2727

2828
class AudioLoader(BaseLoader):
2929
def __init__(
30-
self, source_path: str, indices: List[str], chunk_size: Optional[int] = None
30+
self,
31+
source_path: str,
32+
indices: List[str],
33+
chunk_size: Optional[int] = None,
34+
normalize: bool = True,
3135
):
3236
super().__init__(source_path, indices, chunk_size, ModalityType.AUDIO)
37+
self.normalize = normalize
3338

3439
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
3540
self.file_sanity_check(file)
3641
audio, sr = librosa.load(file)
42+
43+
if self.normalize:
44+
audio = librosa.util.normalize(audio)
45+
3746
self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio)
3847

3948
self.data.append(audio)
File renamed without changes.

src/main/python/systemds/scuro/aligner/dr_search.py renamed to src/main/python/systemds/scuro/drsearch/dr_search.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import random
2323
from typing import List
2424

25-
from systemds.scuro.aligner.task import Task
25+
from systemds.scuro.drsearch.task import Task
2626
from systemds.scuro.modality.modality import Modality
2727
from systemds.scuro.representations.representation import Representation
2828

@@ -111,7 +111,7 @@ def fit_random(self, seed=-1):
111111
representation = random.choice(self.representations)
112112

113113
modality = modality_combination[0].combine(
114-
modality_combination[1:], representation
114+
list(modality_combination[1:]), representation
115115
)
116116

117117
scores = self.task.run(modality.data)

0 commit comments

Comments
 (0)