Skip to content

Commit 7dde438

Browse files
[SYSTEMDS-3939] Add MLP-Aggregation operator to Scuro
This patch adds the initial skeleton for the dimensionality reduction operators. Additionally, it adds the implementation of the MLP-Aggregation operator.
1 parent 38d348e commit 7dde438

File tree

15 files changed

+579
-97
lines changed

15 files changed

+579
-97
lines changed

.github/workflows/python.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ jobs:
142142
export PATH=$SYSTEMDS_ROOT/bin:$PATH
143143
cd src/main/python
144144
./tests/federated/runFedTest.sh
145-
145+
146146
- name: Cache Torch Hub
147147
if: ${{ matrix.test_mode == 'scuro' }}
148148
id: torch-cache
@@ -158,6 +158,8 @@ jobs:
158158
env:
159159
TORCH_HOME: ${{ github.workspace }}/.torch
160160
run: |
161+
df -h
162+
exit
161163
( while true; do echo "."; sleep 25; done ) &
162164
KA=$!
163165
pip install --upgrade pip wheel setuptools

src/main/python/systemds/scuro/__init__.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,13 @@
116116
OverlappingSplitIndices,
117117
)
118118
from systemds.scuro.representations.elmo import ELMoRepresentation
119-
119+
from systemds.scuro.representations.dimensionality_reduction import (
120+
DimensionalityReduction,
121+
)
122+
from systemds.scuro.representations.mlp_averaging import MLPAveraging
123+
from systemds.scuro.representations.mlp_learned_dim_reduction import (
124+
MLPLearnedDimReduction,
125+
)
120126

121127
__all__ = [
122128
"BaseLoader",
@@ -202,4 +208,7 @@
202208
"ELMoRepresentation",
203209
"SentenceBoundarySplitIndices",
204210
"OverlappingSplitIndices",
211+
"MLPAveraging",
212+
"MLPLearnedDimReduction",
213+
"DimensionalityReduction",
205214
]

src/main/python/systemds/scuro/drsearch/operator_registry.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class Registry:
3737
_fusion_operators = []
3838
_text_context_operators = []
3939
_video_context_operators = []
40+
_dimensionality_reduction_operators = {}
4041

4142
def __new__(cls):
4243
if not cls._instance:
@@ -73,6 +74,18 @@ def add_context_operator(self, context_operator, modality_type):
7374
def add_fusion_operator(self, fusion_operator):
7475
self._fusion_operators.append(fusion_operator)
7576

77+
def add_dimensionality_reduction_operator(
78+
self, dimensionality_reduction_operator, modality_type
79+
):
80+
if not isinstance(modality_type, list):
81+
modality_type = [modality_type]
82+
for m_type in modality_type:
83+
if not m_type in self._dimensionality_reduction_operators.keys():
84+
self._dimensionality_reduction_operators[m_type] = []
85+
self._dimensionality_reduction_operators[m_type].append(
86+
dimensionality_reduction_operator
87+
)
88+
7689
def get_representations(self, modality: ModalityType):
7790
return self._representations[modality]
7891

@@ -86,6 +99,9 @@ def get_not_self_contained_representations(self, modality: ModalityType):
8699
def get_context_operators(self, modality_type):
87100
return self._context_operators[modality_type]
88101

102+
def get_dimensionality_reduction_operators(self, modality_type):
103+
return self._dimensionality_reduction_operators[modality_type]
104+
89105
def get_fusion_operators(self):
90106
return self._fusion_operators
91107

@@ -127,6 +143,18 @@ def decorator(cls):
127143
return decorator
128144

129145

146+
def register_dimensionality_reduction_operator(modality_type):
147+
"""
148+
Decorator to register a dimensionality reduction operator.
149+
"""
150+
151+
def decorator(cls):
152+
Registry().add_dimensionality_reduction_operator(cls, modality_type)
153+
return cls
154+
155+
return decorator
156+
157+
130158
def register_context_operator(modality_type):
131159
"""
132160
Decorator to register a context operator.

src/main/python/systemds/scuro/drsearch/representation_dag.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@
3030
AggregatedRepresentation,
3131
)
3232
from systemds.scuro.representations.context import Context
33+
from systemds.scuro.representations.dimensionality_reduction import (
34+
DimensionalityReduction,
35+
)
3336
from systemds.scuro.utils.identifier import get_op_id, get_node_id
3437

3538
from collections import OrderedDict
@@ -195,6 +198,8 @@ def execute_node(node_id: str, task) -> TransformedModality:
195198
# It's a unimodal operation
196199
if isinstance(node_operation, Context):
197200
result = input_mods[0].context(node_operation)
201+
elif isinstance(node_operation, DimensionalityReduction):
202+
result = input_mods[0].dimensionality_reduction(node_operation)
198203
elif isinstance(node_operation, AggregatedRepresentation):
199204
result = node_operation.transform(input_mods[0])
200205
elif isinstance(node_operation, UnimodalRepresentation):

src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
import multiprocessing as mp
2626
from typing import List, Any
2727
from functools import lru_cache
28-
from systemds.scuro.drsearch.task import Task
2928
from systemds.scuro import ModalityType
3029
from systemds.scuro.drsearch.ranking import rank_by_tradeoff
3130
from systemds.scuro.drsearch.task import PerformanceMeasure
@@ -92,6 +91,12 @@ def _get_not_self_contained_reps(self, modality_type):
9291
def _get_context_operators(self, modality_type):
9392
return self.operator_registry.get_context_operators(modality_type)
9493

94+
@lru_cache(maxsize=32)
95+
def _get_dimensionality_reduction_operators(self, modality_type):
96+
return self.operator_registry.get_dimensionality_reduction_operators(
97+
modality_type
98+
)
99+
95100
def store_results(self, file_name=None):
96101
if file_name is None:
97102
import time
@@ -185,9 +190,7 @@ def _process_modality(self, modality, parallel):
185190

186191
external_cache = LRUCache(max_size=32)
187192
for dag in dags:
188-
representations = dag.execute(
189-
[modality], task=self.tasks[0], external_cache=external_cache
190-
) # TODO: dynamic task selection
193+
representations = dag.execute([modality], external_cache=external_cache)
191194
node_id = list(representations.keys())[-1]
192195
node = dag.get_node_by_id(node_id)
193196
if node.operation is None:
@@ -303,6 +306,27 @@ def _evaluate_local(self, modality, local_results, dag, combination=None):
303306
scores, modality, task.model.name, end - start, combination, dag
304307
)
305308

309+
def add_dimensionality_reduction_operators(self, builder, current_node_id):
310+
dags = []
311+
modality_type = (
312+
builder.get_node(current_node_id).operation().output_modality_type
313+
)
314+
315+
if modality_type is not ModalityType.EMBEDDING:
316+
return None
317+
318+
dimensionality_reduction_operators = (
319+
self._get_dimensionality_reduction_operators(modality_type)
320+
)
321+
for dimensionality_reduction_op in dimensionality_reduction_operators:
322+
dimensionality_reduction_node_id = builder.create_operation_node(
323+
dimensionality_reduction_op,
324+
[current_node_id],
325+
dimensionality_reduction_op().get_current_parameters(),
326+
)
327+
dags.append(builder.build(dimensionality_reduction_node_id))
328+
return dags
329+
306330
def _build_modality_dag(
307331
self, modality: Modality, operator: Any
308332
) -> List[RepresentationDag]:
@@ -316,6 +340,12 @@ def _build_modality_dag(
316340
current_node_id = rep_node_id
317341
dags.append(builder.build(current_node_id))
318342

343+
dimensionality_reduction_dags = self.add_dimensionality_reduction_operators(
344+
builder, current_node_id
345+
)
346+
if dimensionality_reduction_dags is not None:
347+
dags.extend(dimensionality_reduction_dags)
348+
319349
if operator.needs_context:
320350
context_operators = self._get_context_operators(modality.modality_type)
321351
for context_op in context_operators:
@@ -339,6 +369,11 @@ def _build_modality_dag(
339369
[context_node_id],
340370
operator.get_current_parameters(),
341371
)
372+
dimensionality_reduction_dags = self.add_dimensionality_reduction_operators(
373+
builder, context_rep_node_id
374+
) # TODO: check if this is correctly using the 3d approach of the dimensionality reduction operator
375+
if dimensionality_reduction_dags is not None:
376+
dags.extend(dimensionality_reduction_dags)
342377

343378
agg_operator = AggregatedRepresentation()
344379
context_agg_node_id = builder.create_operation_node(

src/main/python/systemds/scuro/modality/transformed.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,15 @@ def context(self, context_operator):
122122
transformed_modality.transform_time += time.time() - start
123123
return transformed_modality
124124

125+
def dimensionality_reduction(self, dimensionality_reduction_operator):
126+
transformed_modality = TransformedModality(
127+
self, dimensionality_reduction_operator, self_contained=self.self_contained
128+
)
129+
start = time.time()
130+
transformed_modality.data = dimensionality_reduction_operator.execute(self.data)
131+
transformed_modality.transform_time += time.time() - start
132+
return transformed_modality
133+
125134
def apply_representation(self, representation):
126135
start = time.time()
127136
new_modality = representation.transform(self)
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# -------------------------------------------------------------
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
#
20+
# -------------------------------------------------------------
21+
import abc
22+
23+
import numpy as np
24+
25+
from systemds.scuro.modality.modality import Modality
26+
from systemds.scuro.representations.representation import Representation
27+
28+
29+
class DimensionalityReduction(Representation):
30+
def __init__(self, name, parameters=None):
31+
"""
32+
Parent class for different dimensionality reduction operations
33+
:param name: Name of the dimensionality reduction operator
34+
"""
35+
super().__init__(name, parameters)
36+
self.needs_training = False
37+
38+
@abc.abstractmethod
39+
def execute(self, data, labels=None):
40+
"""
41+
Implemented for every child class and creates a sampled representation for a given modality
42+
:param data: data to apply the dimensionality reduction on
43+
:param labels: labels for learned dimensionality reduction
44+
:return: dimensionality reduced data
45+
"""
46+
if labels is not None:
47+
self.execute_with_training(data, labels)
48+
else:
49+
self.execute(data)
50+
51+
def apply_representation(self, data):
52+
"""
53+
Implemented for every child class and creates a dimensionality reduced representation for a given modality
54+
:param data: data to apply the representation on
55+
:return: dimensionality reduced data
56+
"""
57+
raise f"Not implemented for Dimensionality Reduction Operator: {self.name}"
58+
59+
def execute_with_training(self, modality, task):
60+
fusion_train_indices = task.fusion_train_indices
61+
# Handle 3d data
62+
data = modality.data
63+
if (
64+
len(np.array(modality.data).shape) == 3
65+
and np.array(modality.data).shape[1] == 1
66+
):
67+
data = np.array([x.reshape(-1) for x in modality.data])
68+
transformed_train = self.execute(
69+
np.array(data)[fusion_train_indices], task.labels[fusion_train_indices]
70+
)
71+
72+
all_other_indices = [
73+
i for i in range(len(modality.data)) if i not in fusion_train_indices
74+
]
75+
transformed_other = self.apply_representation(np.array(data)[all_other_indices])
76+
77+
transformed_data = np.zeros((len(data), transformed_train.shape[1]))
78+
transformed_data[fusion_train_indices] = transformed_train
79+
transformed_data[all_other_indices] = transformed_other
80+
81+
return transformed_data

src/main/python/systemds/scuro/representations/glove.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -59,18 +59,20 @@ def transform(self, modality):
5959
glove_embeddings = load_glove_embeddings(self.glove_path)
6060

6161
embeddings = []
62+
embedding_dim = (
63+
len(next(iter(glove_embeddings.values()))) if glove_embeddings else 100
64+
)
65+
6266
for sentences in modality.data:
6367
tokens = list(tokenize(sentences.lower()))
64-
embeddings.append(
65-
np.mean(
66-
[
67-
glove_embeddings[token]
68-
for token in tokens
69-
if token in glove_embeddings
70-
],
71-
axis=0,
72-
)
73-
)
68+
token_embeddings = [
69+
glove_embeddings[token] for token in tokens if token in glove_embeddings
70+
]
71+
72+
if len(token_embeddings) > 0:
73+
embeddings.append(np.mean(token_embeddings, axis=0))
74+
else:
75+
embeddings.append(np.zeros(embedding_dim, dtype=np.float32))
7476

7577
if self.output_file is not None:
7678
save_embeddings(np.array(embeddings), self.output_file)

0 commit comments

Comments
 (0)